diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 0000000..e69de29 diff --git a/cache.json b/cache.json new file mode 100644 index 0000000..2081d20 --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2025-01-07T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2501.04005v1","updated":"2025-01-07T18:59:59Z","published":"2025-01-07T18:59:59Z","title":"LargeAD: Large-Scale Cross-Sensor Data Pretraining for Autonomous\n Driving","summary":" Recent advancements in vision foundation models (VFMs) have revolutionized\nvisual perception in 2D, yet their potential for 3D scene understanding,\nparticularly in autonomous driving applications, remains underexplored. In this\npaper, we introduce LargeAD, a versatile and scalable framework designed for\nlarge-scale 3D pretraining across diverse real-world driving datasets. Our\nframework leverages VFMs to extract semantically rich superpixels from 2D\nimages, which are aligned with LiDAR point clouds to generate high-quality\ncontrastive samples. This alignment facilitates cross-modal representation\nlearning, enhancing the semantic consistency between 2D and 3D data. We\nintroduce several key innovations: i) VFM-driven superpixel generation for\ndetailed semantic representation, ii) a VFM-assisted contrastive learning\nstrategy to align multimodal features, iii) superpoint temporal consistency to\nmaintain stable representations across time, and iv) multi-source data\npretraining to generalize across various LiDAR configurations. Our approach\ndelivers significant performance improvements over state-of-the-art methods in\nboth linear probing and fine-tuning tasks for both LiDAR-based segmentation and\nobject detection. 
Extensive experiments on eleven large-scale multi-modal\ndatasets highlight our superior performance, demonstrating the adaptability,\nefficiency, and robustness in real-world autonomous driving scenarios.\n","authors":["Lingdong Kong","Xiang Xu","Youquan Liu","Jun Cen","Runnan Chen","Wenwei Zhang","Liang Pan","Kai Chen","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2501.04005v1.pdf","comment":"Preprint; 16 pages, 7 figures, 8 tables; Project Page at\n https://ldkong.com/LargeAD"},{"id":"http://arxiv.org/abs/2501.04004v1","updated":"2025-01-07T18:59:58Z","published":"2025-01-07T18:59:58Z","title":"LiMoE: Mixture of LiDAR Representation Learners from Automotive Scenes","summary":" LiDAR data pretraining offers a promising approach to leveraging large-scale,\nreadily available datasets for enhanced data utilization. However, existing\nmethods predominantly focus on sparse voxel representation, overlooking the\ncomplementary attributes provided by other LiDAR representations. In this work,\nwe propose LiMoE, a framework that integrates the Mixture of Experts (MoE)\nparadigm into LiDAR data representation learning to synergistically combine\nmultiple representations, such as range images, sparse voxels, and raw points.\nOur approach consists of three stages: i) Image-to-LiDAR Pretraining, which\ntransfers prior knowledge from images to point clouds across different\nrepresentations; ii) Contrastive Mixture Learning (CML), which uses MoE to\nadaptively activate relevant attributes from each representation and distills\nthese mixed features into a unified 3D network; iii) Semantic Mixture\nSupervision (SMS), which combines semantic logits from multiple representations\nto boost downstream segmentation performance. Extensive experiments across 11\nlarge-scale LiDAR datasets demonstrate our effectiveness and superiority. 
The\ncode and model checkpoints have been made publicly accessible.\n","authors":["Xiang Xu","Lingdong Kong","Hui Shuai","Liang Pan","Ziwei Liu","Qingshan Liu"],"pdf_url":"https://arxiv.org/pdf/2501.04004v1.pdf","comment":"Preprint; 26 pages, 17 figures, 7 tables; Project Page at\n https://ldkong.com/LiMoE"},{"id":"http://arxiv.org/abs/2501.04003v1","updated":"2025-01-07T18:59:55Z","published":"2025-01-07T18:59:55Z","title":"Are VLMs Ready for Autonomous Driving? An Empirical Study from the\n Reliability, Data, and Metric Perspectives","summary":" Recent advancements in Vision-Language Models (VLMs) have sparked interest in\ntheir use for autonomous driving, particularly in generating interpretable\ndriving decisions through natural language. However, the assumption that VLMs\ninherently provide visually grounded, reliable, and interpretable explanations\nfor driving remains largely unexamined. To address this gap, we introduce\nDriveBench, a benchmark dataset designed to evaluate VLM reliability across 17\nsettings (clean, corrupted, and text-only inputs), encompassing 19,200 frames,\n20,498 question-answer pairs, three question types, four mainstream driving\ntasks, and a total of 12 popular VLMs. Our findings reveal that VLMs often\ngenerate plausible responses derived from general knowledge or textual cues\nrather than true visual grounding, especially under degraded or missing visual\ninputs. This behavior, concealed by dataset imbalances and insufficient\nevaluation metrics, poses significant risks in safety-critical scenarios like\nautonomous driving. We further observe that VLMs struggle with multi-modal\nreasoning and display heightened sensitivity to input corruptions, leading to\ninconsistencies in performance. To address these challenges, we propose refined\nevaluation metrics that prioritize robust visual grounding and multi-modal\nunderstanding. 
Additionally, we highlight the potential of leveraging VLMs'\nawareness of corruptions to enhance their reliability, offering a roadmap for\ndeveloping more trustworthy and interpretable decision-making systems in\nreal-world autonomous driving contexts. The benchmark toolkit is publicly\naccessible.\n","authors":["Shaoyuan Xie","Lingdong Kong","Yuhao Dong","Chonghao Sima","Wenwei Zhang","Qi Alfred Chen","Ziwei Liu","Liang Pan"],"pdf_url":"https://arxiv.org/pdf/2501.04003v1.pdf","comment":"Preprint; 41 pages, 32 figures, 16 tables; Project Page at\n https://drive-bench.github.io/"},{"id":"http://arxiv.org/abs/2412.05313v3","updated":"2025-01-07T18:57:23Z","published":"2024-11-28T19:31:50Z","title":"λ: A Benchmark for Data-Efficiency in Long-Horizon Indoor Mobile\n Manipulation Robotics","summary":" Efficiently learning and executing long-horizon mobile manipulation (MoMa)\ntasks is crucial for advancing robotics in household and workplace settings.\nHowever, current MoMa models are data-inefficient, underscoring the need for\nimproved models that require realistic-sized benchmarks to evaluate their\nefficiency, which do not exist. To address this, we introduce the LAMBDA\n({\\lambda}) benchmark (Long-horizon Actions for Mobile-manipulation\nBenchmarking of Directed Activities), which evaluates the data efficiency of\nmodels on language-conditioned, long-horizon, multi-room, multi-floor,\npick-and-place tasks using a dataset of manageable size, more feasible for\ncollection. The benchmark includes 571 human-collected demonstrations that\nprovide realism and diversity in simulated and real-world settings. Unlike\nplanner-generated data, these trajectories offer natural variability and\nreplay-verifiability, ensuring robust learning and evaluation. 
We benchmark\nseveral models, including learning-based models and a neuro-symbolic modular\napproach combining foundation models with task and motion planning.\nLearning-based models show suboptimal success rates, even when leveraging\npretrained weights, underscoring significant data inefficiencies. However, the\nneuro-symbolic approach performs significantly better while being more data\nefficient. Findings highlight the need for more data-efficient learning-based\nMoMa approaches. {\\lambda} addresses this gap by serving as a key benchmark for\nevaluating the data efficiency of those future models in handling household\nrobotics tasks.\n","authors":["Ahmed Jaafar","Shreyas Sundara Raman","Yichen Wei","Sudarshan Harithas","Sofia Juliani","Anneke Wernerfelt","Benedict Quartey","Ifrah Idrees","Jason Xinyu Liu","Stefanie Tellex"],"pdf_url":"https://arxiv.org/pdf/2412.05313v3.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2412.20429v3","updated":"2025-01-07T18:24:45Z","published":"2024-12-29T10:46:08Z","title":"Multi-Scenario Reasoning: Unlocking Cognitive Autonomy in Humanoid\n Robots for Multimodal Understanding","summary":" To improve the cognitive autonomy of humanoid robots, this research proposes\na multi-scenario reasoning architecture to solve the technical shortcomings of\nmulti-modal understanding in this field. It draws on simulation based\nexperimental design that adopts multi-modal synthesis (visual, auditory,\ntactile) and builds a simulator \"Maha\" to perform the experiment. The findings\ndemonstrate the feasibility of this architecture in multimodal data. It\nprovides reference experience for the exploration of cross-modal interaction\nstrategies for humanoid robots in dynamic environments. In addition,\nmulti-scenario reasoning simulates the high-level reasoning mechanism of the\nhuman brain to humanoid robots at the cognitive level. This new concept\npromotes cross-scenario practical task transfer and semantic-driven action\nplanning. 
It heralds the future development of self-learning and autonomous\nbehavior of humanoid robots in changing scenarios.\n","authors":["Libo Wang"],"pdf_url":"https://arxiv.org/pdf/2412.20429v3.pdf","comment":"The main text is 5 pages, 2 figures, and 3 tables"},{"id":"http://arxiv.org/abs/2501.03972v1","updated":"2025-01-07T18:22:44Z","published":"2025-01-07T18:22:44Z","title":"MAD-BA: 3D LiDAR Bundle Adjustment -- from Uncertainty Modelling to\n Structure Optimization","summary":" The joint optimization of sensor poses and 3D structure is fundamental for\nstate estimation in robotics and related fields. Current LiDAR systems often\nprioritize pose optimization, with structure refinement either omitted or\ntreated separately using representations like signed distance functions or\nneural networks. This paper introduces a framework for simultaneous\noptimization of sensor poses and 3D map, represented as surfels. A generalized\nLiDAR uncertainty model is proposed to address degraded or less reliable\nmeasurements in varying scenarios. Experimental results on public datasets\ndemonstrate improved performance over most comparable state-of-the-art methods.\nThe system is provided as open-source software to support further research.\n","authors":["Krzysztof Ćwian","Luca Di Giammarino","Simone Ferrari","Thomas Ciarfuglia","Giorgio Grisetti","Piotr Skrzypczyński"],"pdf_url":"https://arxiv.org/pdf/2501.03972v1.pdf","comment":"8 pages, 6 figures, this work has been submitted to IEEE RA-L"},{"id":"http://arxiv.org/abs/2501.03971v1","updated":"2025-01-07T18:22:23Z","published":"2025-01-07T18:22:23Z","title":"Impact of Leg Stiffness on Energy Efficiency in One Legged Hopping","summary":" In the fields of robotics and biomechanics, the integration of elastic\nelements such as springs and tendons in legged systems has long been recognized\nfor enabling energy-efficient locomotion. 
Yet, a significant challenge\npersists: designing a robotic leg that perform consistently across diverse\noperating conditions, especially varying average forward speeds. It remains\nunclear whether, for such a range of operating conditions, the stiffness of the\nelastic elements needs to be varied or if a similar performance can be obtained\nby changing the motion and actuation while keeping the stiffness fixed. This\nwork explores the influence of the leg stiffness on the energy efficiency of a\nmonopedal robot through an extensive parametric study of its periodic hopping\nmotion. To this end, we formulate an optimal control problem parameterized by\naverage forward speed and leg stiffness, solving it numerically using direct\ncollocation. Our findings indicate that, compared to the use of a fixed\nstiffness, employing variable stiffness in legged systems improves energy\nefficiency by 20 % maximally and by 6.8 % on average across a range of speeds.\n","authors":["Iskandar Khemakhem","Dominik Tschemernjak","Maximilian Raff","C. David Remy"],"pdf_url":"https://arxiv.org/pdf/2501.03971v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03968v1","updated":"2025-01-07T18:06:27Z","published":"2025-01-07T18:06:27Z","title":"VLM-driven Behavior Tree for Context-aware Task Planning","summary":" The use of Large Language Models (LLMs) for generating Behavior Trees (BTs)\nhas recently gained attention in the robotics community, yet remains in its\nearly stages of development. In this paper, we propose a novel framework that\nleverages Vision-Language Models (VLMs) to interactively generate and edit BTs\nthat address visual conditions, enabling context-aware robot operations in\nvisually complex environments. A key feature of our approach lies in the\nconditional control through self-prompted visual conditions. Specifically, the\nVLM generates BTs with visual condition nodes, where conditions are expressed\nas free-form text. 
Another VLM process integrates the text into its prompt and\nevaluates the conditions against real-world images during robot execution. We\nvalidated our framework in a real-world cafe scenario, demonstrating both its\nfeasibility and limitations.\n","authors":["Naoki Wake","Atsushi Kanehira","Jun Takamatsu","Kazuhiro Sasabuchi","Katsushi Ikeuchi"],"pdf_url":"https://arxiv.org/pdf/2501.03968v1.pdf","comment":"10 pages, 11 figures, 5 tables. Last updated on January 7th, 2024"},{"id":"http://arxiv.org/abs/2408.01333v3","updated":"2025-01-07T17:32:29Z","published":"2024-08-02T15:30:51Z","title":"Incorporating Control Inputs in Continuous-Time Gaussian Process State\n Estimation for Robotics","summary":" Continuous-time batch state estimation using Gaussian processes is an\nefficient approach to estimate the trajectories of robots over time. In the\npast, relatively simple physics-motivated priors have been considered for such\napproaches, using assumptions such as constant velocity or acceleration. This\npaper presents an approach to incorporating exogenous control inputs, such as\nvelocity or acceleration commands, into the continuous Gaussian process\nstate-estimation framework. It is shown that this approach generalizes across\ndifferent domains in robotics, making it applicable to both the estimation of\ncontinuous-time trajectories for mobile robots and the estimation of\nquasi-static continuum robot shapes. Results show that incorporating control\ninputs leads to more informed priors, potentially requiring less measurements\nand estimation nodes to obtain accurate estimates. This makes the approach\nparticularly useful in situations in which limited sensing is available. 
For\nexample, in a mobile robot localization experiment with sparse landmark\ndistance measurements and frequent odometry control inputs, our approach\nprovides accurate trajectory estimates with root-mean-square errors around 3-4\ncm and 4-5 degrees, even with time intervals up to five seconds between\ndiscrete estimation nodes, which significantly reduces computation time.\n","authors":["Sven Lilge","Timothy D. Barfoot"],"pdf_url":"https://arxiv.org/pdf/2408.01333v3.pdf","comment":"21 pages, 7 figures, Accepted to Robotica"},{"id":"http://arxiv.org/abs/2410.23963v2","updated":"2025-01-07T16:48:25Z","published":"2024-10-31T14:15:54Z","title":"Exploiting Information Theory for Intuitive Robot Programming of Manual\n Activities","summary":" Observational learning is a promising approach to enable people without\nexpertise in programming to transfer skills to robots in a user-friendly\nmanner, since it mirrors how humans learn new behaviors by observing others.\nMany existing methods focus on instructing robots to mimic human trajectories,\nbut motion-level strategies often pose challenges in skills generalization\nacross diverse environments. This paper proposes a novel framework that allows\nrobots to achieve a higher-level understanding of human-demonstrated manual\ntasks recorded in RGB videos. By recognizing the task structure and goals,\nrobots generalize what observed to unseen scenarios. We found our task\nrepresentation on Shannon's Information Theory (IT), which is applied for the\nfirst time to manual tasks. IT helps extract the active scene elements and\nquantify the information shared between hands and objects. We exploit scene\ngraph properties to encode the extracted interaction features in a compact\nstructure and segment the demonstration into blocks, streamlining the\ngeneration of Behavior Trees for robot replicas. Experiments validated the\neffectiveness of IT to automatically generate robot execution plans from a\nsingle human demonstration. 
Additionally, we provide HANDSOME, an open-source\ndataset of HAND Skills demOnstrated by Multi-subjEcts, to promote further\nresearch and evaluation in this field.\n","authors":["Elena Merlo","Marta Lagomarsino","Edoardo Lamon","Arash Ajoudani"],"pdf_url":"https://arxiv.org/pdf/2410.23963v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03907v1","updated":"2025-01-07T16:22:12Z","published":"2025-01-07T16:22:12Z","title":"Implicit Coordination using Active Epistemic Inference","summary":" A Multi-robot system (MRS) provides significant advantages for intricate\ntasks such as environmental monitoring, underwater inspections, and space\nmissions. However, addressing potential communication failures or the lack of\ncommunication infrastructure in these fields remains a challenge. A significant\nportion of MRS research presumes that the system can maintain communication\nwith proximity constraints, but this approach does not solve situations where\ncommunication is either non-existent, unreliable, or poses a security risk.\nSome approaches tackle this issue using predictions about other robots while\nnot communicating, but these methods generally only permit agents to utilize\nfirst-order reasoning, which involves reasoning based purely on their own\nobservations. In contrast, to deal with this problem, our proposed framework\nutilizes Theory of Mind (ToM), employing higher-order reasoning by shifting a\nrobot's perspective to reason about a belief of others observations. Our\napproach has two main phases: i) an efficient runtime plan adaptation using\nactive inference to signal intentions and reason about a robot's own belief and\nthe beliefs of others in the system, and ii) a hierarchical epistemic planning\nframework to iteratively reason about the current MRS mission state. 
The\nproposed framework outperforms greedy and first-order reasoning approaches and\nis validated using simulations and experiments with heterogeneous robotic\nsystems.\n","authors":["Lauren Bramblett","Jonathan Reasoner","Nicola Bezzo"],"pdf_url":"https://arxiv.org/pdf/2501.03907v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03881v1","updated":"2025-01-07T15:44:06Z","published":"2025-01-07T15:44:06Z","title":"An LSTM-based Test Selection Method for Self-Driving Cars","summary":" Self-driving cars require extensive testing, which can be costly in terms of\ntime. To optimize this process, simple and straightforward tests should be\nexcluded, focusing on challenging tests instead. This study addresses the test\nselection problem for lane-keeping systems for self-driving cars. Road segment\nfeatures, such as angles and lengths, were extracted and treated as sequences,\nenabling classification of the test cases as \"safe\" or \"unsafe\" using a long\nshort-term memory (LSTM) model. The proposed model is compared against machine\nlearning-based test selectors. Results demonstrated that the LSTM-based method\noutperformed machine learning-based methods in accuracy and precision metrics\nwhile exhibiting comparable performance in recall and F1 scores. 
This work\nintroduces a novel deep learning-based approach to the road classification\nproblem, providing an effective solution for self-driving car test selection\nusing a simulation environment.\n","authors":["Ali Güllü","Faiz Ali Shah","Dietmar Pfahl"],"pdf_url":"https://arxiv.org/pdf/2501.03881v1.pdf","comment":"8 pages, 6 figures, 5 tables"},{"id":"http://arxiv.org/abs/2501.03859v1","updated":"2025-01-07T15:16:16Z","published":"2025-01-07T15:16:16Z","title":"A Synergistic Framework for Learning Shape Estimation and Shape-Aware\n Whole-Body Control Policy for Continuum Robots","summary":" In this paper, we present a novel synergistic framework for learning shape\nestimation and a shape-aware whole-body control policy for tendon-driven\ncontinuum robots. Our approach leverages the interaction between two Augmented\nNeural Ordinary Differential Equations (ANODEs) -- the Shape-NODE and\nControl-NODE -- to achieve continuous shape estimation and shape-aware control.\nThe Shape-NODE integrates prior knowledge from Cosserat rod theory, allowing it\nto adapt and account for model mismatches, while the Control-NODE uses this\nshape information to optimize a whole-body control policy, trained in a Model\nPredictive Control (MPC) fashion. This unified framework effectively overcomes\nlimitations of existing data-driven methods, such as poor shape awareness and\nchallenges in capturing complex nonlinear dynamics. 
Extensive evaluations in\nboth simulation and real-world environments demonstrate the framework's robust\nperformance in shape estimation, trajectory tracking, and obstacle avoidance.\nThe proposed method consistently outperforms state-of-the-art end-to-end,\nNeural-ODE, and Recurrent Neural Network (RNN) models, particularly in terms of\ntracking accuracy and generalization capabilities.\n","authors":["Mohammadreza Kasaei","Farshid Alambeigi","Mohsen Khadem"],"pdf_url":"https://arxiv.org/pdf/2501.03859v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03191v2","updated":"2025-01-07T15:04:47Z","published":"2024-12-04T10:23:27Z","title":"Soft Adaptive Feet for Legged Robots: An Open-Source Model for\n Locomotion Simulation","summary":" In recent years, artificial feet based on soft robotics and under-actuation\nprinciples emerged to improve mobility on challenging terrains. This paper\npresents the application of the MuJoCo physics engine to realize a digital twin\nof an adaptive soft foot developed for use with legged robots. We release the\nMuJoCo soft foot digital twin as open source to allow users and researchers to\nexplore new approaches to locomotion. The work includes the system modeling\ntechniques along with the kinematic and dynamic attributes involved. Validation\nis conducted through a rigorous comparison with bench tests on a physical\nprototype, replicating these experiments in simulation. Results are evaluated\nbased on sole deformation and contact forces during foot-obstacle interaction.\nThe foot model is subsequently integrated into simulations of the humanoid\nrobot COMAN+, replacing its original flat feet. Results show an improvement in\nthe robot's ability to negotiate small obstacles without altering its control\nstrategy. 
Ultimately, this study offers a comprehensive modeling approach for\nadaptive soft feet, supported by qualitative comparisons of bipedal locomotion\nwith state of the art robotic feet.\n","authors":["Matteo Crotti","Luca Rossini","Balint K. Hodossy","Anna Pace","Giorgio Grioli","Antonio Bicchi","Manuel G. Catalano"],"pdf_url":"https://arxiv.org/pdf/2412.03191v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03841v1","updated":"2025-01-07T14:50:33Z","published":"2025-01-07T14:50:33Z","title":"OmniManip: Towards General Robotic Manipulation via Object-Centric\n Interaction Primitives as Spatial Constraints","summary":" The development of general robotic systems capable of manipulating in\nunstructured environments is a significant challenge. While Vision-Language\nModels(VLM) excel in high-level commonsense reasoning, they lack the\nfine-grained 3D spatial understanding required for precise manipulation tasks.\nFine-tuning VLM on robotic datasets to create Vision-Language-Action\nModels(VLA) is a potential solution, but it is hindered by high data collection\ncosts and generalization issues. To address these challenges, we propose a\nnovel object-centric representation that bridges the gap between VLM's\nhigh-level reasoning and the low-level precision required for manipulation. Our\nkey insight is that an object's canonical space, defined by its functional\naffordances, provides a structured and semantically meaningful way to describe\ninteraction primitives, such as points and directions. These primitives act as\na bridge, translating VLM's commonsense reasoning into actionable 3D spatial\nconstraints. In this context, we introduce a dual closed-loop, open-vocabulary\nrobotic manipulation system: one loop for high-level planning through primitive\nresampling, interaction rendering and VLM checking, and another for low-level\nexecution via 6D pose tracking. This design ensures robust, real-time control\nwithout requiring VLM fine-tuning. 
Extensive experiments demonstrate strong\nzero-shot generalization across diverse robotic manipulation tasks,\nhighlighting the potential of this approach for automating large-scale\nsimulation data generation.\n","authors":["Mingjie Pan","Jiyao Zhang","Tianshu Wu","Yinghao Zhao","Wenlong Gao","Hao Dong"],"pdf_url":"https://arxiv.org/pdf/2501.03841v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.19950v2","updated":"2025-01-07T14:35:01Z","published":"2024-12-27T23:10:32Z","title":"Data-driven tool wear prediction in milling, based on a\n process-integrated single-sensor approach","summary":" Accurate tool wear prediction is essential for maintaining productivity and\nminimizing costs in machining. However, the complex nature of the tool wear\nprocess poses significant challenges to achieving reliable predictions. This\nstudy explores data-driven methods, in particular deep learning, for tool wear\nprediction. Traditional data-driven approaches often focus on a single process,\nrelying on multi-sensor setups and extensive data generation, which limits\ngeneralization to new settings. Moreover, multi-sensor integration is often\nimpractical in industrial environments. To address these limitations, this\nresearch investigates the transferability of predictive models using minimal\ntraining data, validated across two processes. Furthermore, it uses a simple\nsetup with a single acceleration sensor to establish a low-cost data generation\napproach that facilitates the generalization of models to other processes via\ntransfer learning. 
The study evaluates several machine learning models,\nincluding convolutional neural networks (CNN), long short-term memory networks\n(LSTM), support vector machines (SVM) and decision trees, trained on different\ninput formats such as feature vectors and short-time Fourier transform (STFT).\nThe performance of the models is evaluated on different amounts of training\ndata, including scenarios with significantly reduced datasets, providing\ninsight into their effectiveness under constrained data conditions. The results\ndemonstrate the potential of specific models and configurations for effective\ntool wear prediction, contributing to the development of more adaptable and\nefficient predictive maintenance strategies in machining. Notably, the ConvNeXt\nmodel has an exceptional performance, achieving an 99.1% accuracy in\nidentifying tool wear using data from only four milling tools operated until\nthey are worn.\n","authors":["Eric Hirsch","Christian Friedrich"],"pdf_url":"https://arxiv.org/pdf/2412.19950v2.pdf","comment":"Preprint submitted to Robotics and Computer-Integrated Manufacturing\n ,14 pages, 9 figures"},{"id":"http://arxiv.org/abs/2501.03819v1","updated":"2025-01-07T14:32:36Z","published":"2025-01-07T14:32:36Z","title":"An innovative mixed reality approach for Robotics Surgery","summary":" Robotic-assisted procedures offer numerous advantages over traditional\napproaches, including improved dexterity, reduced fatigue, minimized trauma,\nand superior outcomes. However, the main challenge of these systems remains the\npoor visualization and perception of the surgical field. The goal of this paper\nis to provide an innovative approach concerning an application able to improve\nthe surgical procedures offering assistance in both preplanning and\nintraoperative steps of the surgery. 
The system has been designed to offer a\nbetter understanding of the patient through techniques that provide medical\nimages visualization, 3D anatomical structures perception and robotic planning.\nThe application was designed to be intuitive and user friendly, providing an\naugmented reality experience through the Hololens 2 device. It was tested in\nlaboratory conditions, yielding positive results.\n","authors":["Gabriela Rus","Nadim Al Hajjar","Ionut Zima","Calin Vaida","Corina Radu","Damien Chablat","Andra Ciocan","Doina Pîslă"],"pdf_url":"https://arxiv.org/pdf/2501.03819v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.19153v3","updated":"2025-01-07T13:41:26Z","published":"2024-12-26T10:17:21Z","title":"Sketch-MoMa: Teleoperation for Mobile Manipulator via Interpretation of\n Hand-Drawn Sketches","summary":" To use assistive robots in everyday life, a remote control system with common\ndevices, such as 2D devices, is helpful to control the robots anytime and\nanywhere as intended. Hand-drawn sketches are one of the intuitive ways to\ncontrol robots with 2D devices. However, since similar sketches have different\nintentions from scene to scene, existing work needs additional modalities to\nset the sketches' semantics. This requires complex operations for users and\nleads to decreasing usability. In this paper, we propose Sketch-MoMa, a\nteleoperation system using the user-given hand-drawn sketches as instructions\nto control a robot. We use Vision-Language Models (VLMs) to understand the\nuser-given sketches superimposed on an observation image and infer drawn shapes\nand low-level tasks of the robot. We utilize the sketches and the generated\nshapes for recognition and motion planning of the generated low-level tasks for\nprecise and intuitive operations. We validate our approach using\nstate-of-the-art VLMs with 7 tasks and 5 sketch shapes. 
We also demonstrate\nthat our approach effectively specifies the detailed motions, such as how to\ngrasp and how much to rotate. Moreover, we show the competitive usability of\nour approach compared with the existing 2D interface through a user experiment\nwith 14 participants.\n","authors":["Kosei Tanada","Yuka Iwanaga","Masayoshi Tsuchinaga","Yuji Nakamura","Takemitsu Mori","Remi Sakai","Takashi Yamamoto"],"pdf_url":"https://arxiv.org/pdf/2412.19153v3.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Project Page: https://toyotafrc.github.io/SketchMoMa-Proj"},{"id":"http://arxiv.org/abs/2501.03763v1","updated":"2025-01-07T13:04:39Z","published":"2025-01-07T13:04:39Z","title":"3D Printable Gradient Lattice Design for Multi-Stiffness Robotic Fingers","summary":" Human fingers achieve exceptional dexterity and adaptability by combining\nstructures with varying stiffness levels, from soft tissues (low) to tendons\nand cartilage (medium) to bones (high). This paper explores developing a\nrobotic finger with similar multi-stiffness characteristics. Specifically, we\npropose using a lattice configuration, parameterized by voxel size and unit\ncell geometry, to optimize and achieve fine-tuned stiffness properties with\nhigh granularity. A significant advantage of this approach is the feasibility\nof 3D printing the designs in a single process, eliminating the need for manual\nassembly of elements with differing stiffness. Based on this method, we present\na novel, human-like finger, and a soft gripper. We integrate the latter with a\nrigid manipulator and demonstrate the effectiveness in pick and place tasks.\n","authors":["Siebe J. 
Schouten","Tomas Steenman","Rens File","Merlijn Den Hartog","Aimee Sakes","Cosimo Della Santina","Kirsten Lussenburg","Ebrahim Shahabi"],"pdf_url":"https://arxiv.org/pdf/2501.03763v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12761v2","updated":"2025-01-07T11:18:08Z","published":"2024-03-19T14:27:31Z","title":"BTGenBot: Behavior Tree Generation for Robotic Tasks with Lightweight\n LLMs","summary":" This paper presents a novel approach to generating behavior trees for robots\nusing lightweight large language models (LLMs) with a maximum of 7 billion\nparameters. The study demonstrates that it is possible to achieve satisfactory\nresults with compact LLMs when fine-tuned on a specific dataset. The key\ncontributions of this research include the creation of a fine-tuning dataset\nbased on existing behavior trees using GPT-3.5 and a comprehensive comparison\nof multiple LLMs (namely llama2, llama-chat, and code-llama) across nine\ndistinct tasks. To be thorough, we evaluated the generated behavior trees using\nstatic syntactical analysis, a validation system, a simulated environment, and\na real robot. Furthermore, this work opens the possibility of deploying such\nsolutions directly on the robot, enhancing its practical applicability.\nFindings from this study demonstrate the potential of LLMs with a limited\nnumber of parameters in generating effective and efficient robot behaviors.\n","authors":["Riccardo Andrea Izzo","Gianluca Bardaro","Matteo Matteucci"],"pdf_url":"https://arxiv.org/pdf/2403.12761v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03666v1","updated":"2025-01-07T10:06:59Z","published":"2025-01-07T10:06:59Z","title":"Hybrid Machine Learning Model with a Constrained Action Space for\n Trajectory Prediction","summary":" Trajectory prediction is crucial to advancing autonomous driving and\nimproving safety and efficiency. 
Although end-to-end models based on deep learning have\ngreat potential, they often do not consider vehicle dynamic limitations,\nleading to unrealistic predictions. To address this problem, this work\nintroduces a novel hybrid model that combines deep learning with a kinematic\nmotion model. It is able to predict object attributes such as acceleration and\nyaw rate and generate trajectories based on them. A key contribution is the\nincorporation of expert knowledge into the learning objective of the deep\nlearning model. This constrains the available action space,\nthus enabling the prediction of physically feasible object attributes and\ntrajectories, thereby increasing safety and robustness. The proposed hybrid\nmodel facilitates enhanced interpretability, thereby reinforcing the\ntrustworthiness of deep learning methods and promoting the development of safe\nplanning solutions. Experiments conducted on the publicly available real-world\nArgoverse dataset demonstrate realistic driving behaviour, with benchmark\ncomparisons and ablation studies showing promising results.\n","authors":["Alexander Fertig","Lakshman Balasubramanian","Michael Botsch"],"pdf_url":"https://arxiv.org/pdf/2501.03666v1.pdf","comment":"Submitted to 2025 IEEE Intelligent Vehicles Symposium (IV)"},{"id":"http://arxiv.org/abs/2501.03606v1","updated":"2025-01-07T08:14:53Z","published":"2025-01-07T08:14:53Z","title":"VTAO-BiManip: Masked Visual-Tactile-Action Pre-training with Object\n Understanding for Bimanual Dexterous Manipulation","summary":" Bimanual dexterous manipulation remains a significant challenge in robotics\ndue to the high DoFs of each hand and their coordination. 
Existing single-hand\nmanipulation techniques often leverage human demonstrations to guide RL methods\nbut fail to generalize to complex bimanual tasks involving multiple sub-skills.\nIn this paper, we introduce VTAO-BiManip, a novel framework that combines\nvisual-tactile-action pretraining with object understanding to facilitate\ncurriculum RL to enable human-like bimanual manipulation. We improve prior\nlearning by incorporating hand motion data, providing more effective guidance\nfor dual-hand coordination than binary tactile feedback. Our pretraining model\npredicts future actions as well as object pose and size using masked multimodal\ninputs, facilitating cross-modal regularization. To address the multi-skill\nlearning challenge, we introduce a two-stage curriculum RL approach to\nstabilize training. We evaluate our method on a bottle-cap unscrewing task,\ndemonstrating its effectiveness in both simulated and real-world environments.\nOur approach achieves a success rate that surpasses existing visual-tactile\npretraining methods by over 20%.\n","authors":["Zhengnan Sun","Zhaotai Shi","Jiayin Chen","Qingtao Liu","Yu Cui","Qi Ye","Jiming Chen"],"pdf_url":"https://arxiv.org/pdf/2501.03606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03585v1","updated":"2025-01-07T07:19:30Z","published":"2025-01-07T07:19:30Z","title":"Collision Risk Quantification and Conflict Resolution in Trajectory\n Tracking for Acceleration-Actuated Multi-Robot Systems","summary":" One of the pivotal challenges in a multi-robot system is how to give\nattention to accuracy and efficiency while ensuring safety. Prior arts cannot\nstrictly guarantee collision-free for an arbitrarily large number of robots or\nthe results are considerably conservative. Smoothness of the avoidance\ntrajectory also needs to be further optimized. 
This paper proposes an\nacceleration-actuated simultaneous obstacle avoidance and trajectory tracking\nmethod for arbitrarily large teams of robots that provides a nonconservative\ncollision avoidance strategy and gives approaches for deadlock avoidance. We\npropose two ways of deadlock resolution; one involves incorporating an\nauxiliary velocity vector into the error function of the trajectory tracking\nmodule, which is proven to have no influence on global convergence of the\ntracking error. Furthermore, unlike traditional methods that address\nconflicts after a deadlock occurs, our decision-making mechanism avoids\nnear-zero velocities, which is much safer and more efficient in crowded\nenvironments. Extensive comparisons show that the proposed method is superior to\nexisting studies when deployed in a large-scale robot system, with minimal\ninvasiveness.\n","authors":["Xiaoxiao Li","Zhirui Sun","Mansha Zheng","Hongpeng Wang","Shuai Li","Jiankun Wang"],"pdf_url":"https://arxiv.org/pdf/2501.03585v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03575v1","updated":"2025-01-07T06:55:50Z","published":"2025-01-07T06:55:50Z","title":"Cosmos World Foundation Model Platform for Physical AI","summary":" Physical AI needs to be trained digitally first. It needs a digital twin of\nitself, the policy model, and a digital twin of the world, the world model. In\nthis paper, we present the Cosmos World Foundation Model Platform to help\ndevelopers build customized world models for their Physical AI setups. We\nposition a world foundation model as a general-purpose world model that can be\nfine-tuned into customized world models for downstream applications. Our\nplatform covers a video curation pipeline, pre-trained world foundation models,\nexamples of post-training of pre-trained world foundation models, and video\ntokenizers. 
To help Physical AI builders solve the most critical problems of\nour society, we make our platform open-source and our models open-weight with\npermissive licenses available via https://github.com/NVIDIA/Cosmos.\n","authors":[" NVIDIA"," :","Niket Agarwal","Arslan Ali","Maciej Bala","Yogesh Balaji","Erik Barker","Tiffany Cai","Prithvijit Chattopadhyay","Yongxin Chen","Yin Cui","Yifan Ding","Daniel Dworakowski","Jiaojiao Fan","Michele Fenzi","Francesco Ferroni","Sanja Fidler","Dieter Fox","Songwei Ge","Yunhao Ge","Jinwei Gu","Siddharth Gururani","Ethan He","Jiahui Huang","Jacob Huffman","Pooya Jannaty","Jingyi Jin","Seung Wook Kim","Gergely Klár","Grace Lam","Shiyi Lan","Laura Leal-Taixe","Anqi Li","Zhaoshuo Li","Chen-Hsuan Lin","Tsung-Yi Lin","Huan Ling","Ming-Yu Liu","Xian Liu","Alice Luo","Qianli Ma","Hanzi Mao","Kaichun Mo","Arsalan Mousavian","Seungjun Nah","Sriharsha Niverty","David Page","Despoina Paschalidou","Zeeshan Patel","Lindsey Pavao","Morteza Ramezanali","Fitsum Reda","Xiaowei Ren","Vasanth Rao Naik Sabavat","Ed Schmerling","Stella Shi","Bartosz Stefaniak","Shitao Tang","Lyne Tchapmi","Przemek Tredak","Wei-Cheng Tseng","Jibin Varghese","Hao Wang","Haoxiang Wang","Heng Wang","Ting-Chun Wang","Fangyin Wei","Xinyue Wei","Jay Zhangjie Wu","Jiashu Xu","Wei Yang","Lin Yen-Chen","Xiaohui Zeng","Yu Zeng","Jing Zhang","Qinsheng Zhang","Yuxuan Zhang","Qingqing Zhao","Artur Zolkowski"],"pdf_url":"https://arxiv.org/pdf/2501.03575v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03535v1","updated":"2025-01-07T05:15:46Z","published":"2025-01-07T05:15:46Z","title":"SenseRAG: Constructing Environmental Knowledge Bases with Proactive\n Querying for LLM-Based Autonomous Driving","summary":" This study addresses the critical need for enhanced situational awareness in\nautonomous driving (AD) by leveraging the contextual reasoning capabilities of\nlarge language models (LLMs). 
Unlike traditional perception systems that rely\non rigid, label-based annotations, it integrates real-time, multimodal sensor\ndata into a unified, LLMs-readable knowledge base, enabling LLMs to dynamically\nunderstand and respond to complex driving environments. To overcome the\ninherent latency and modality limitations of LLMs, a proactive\nRetrieval-Augmented Generation (RAG) is designed for AD, combined with a\nchain-of-thought prompting mechanism, ensuring rapid and context-rich\nunderstanding. Experimental results using real-world Vehicle-to-everything\n(V2X) datasets demonstrate significant improvements in perception and\nprediction performance, highlighting the potential of this framework to enhance\nsafety, adaptability, and decision-making in next-generation AD systems.\n","authors":["Xuewen Luo","Fan Ding","Fengze Yang","Yang Zhou","Junnyong Loo","Hwa Hui Tew","Chenxi Liu"],"pdf_url":"https://arxiv.org/pdf/2501.03535v1.pdf","comment":"This paper has been accepted for presentation at WACV Workshop LLMAD\n 2025"},{"id":"http://arxiv.org/abs/2401.06949v2","updated":"2025-01-07T05:00:50Z","published":"2024-01-13T02:03:28Z","title":"ORGANA: A Robotic Assistant for Automated Chemistry Experimentation and\n Characterization","summary":" Chemistry experiments can be resource- and labor-intensive, often requiring\nmanual tasks like polishing electrodes in electrochemistry. Traditional lab\nautomation infrastructure faces challenges adapting to new experiments. To\naddress this, we introduce ORGANA, an assistive robotic system that automates\ndiverse chemistry experiments using decision-making and perception tools. It\nmakes decisions with chemists in the loop to control robots and lab devices.\nORGANA interacts with chemists using Large Language Models (LLMs) to derive\nexperiment goals, handle disambiguation, and provide experiment logs. ORGANA\nplans and executes complex tasks with visual feedback, while supporting\nscheduling and parallel task execution. 
We demonstrate ORGANA's capabilities in\nsolubility, pH measurement, recrystallization, and electrochemistry\nexperiments. In electrochemistry, it executes a 19-step plan in parallel to\ncharacterize quinone derivatives for flow batteries. Our user study shows\nORGANA reduces frustration and physical demand by over 50%, with users saving\nan average of 80.3% of their time when using it.\n","authors":["Kourosh Darvish","Marta Skreta","Yuchi Zhao","Naruki Yoshikawa","Sagnik Som","Miroslav Bogdanovic","Yang Cao","Han Hao","Haoping Xu","Alán Aspuru-Guzik","Animesh Garg","Florian Shkurti"],"pdf_url":"https://arxiv.org/pdf/2401.06949v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03515v1","updated":"2025-01-07T04:17:15Z","published":"2025-01-07T04:17:15Z","title":"Effects of Robot Competency and Motion Legibility on Human Correction\n Feedback","summary":" As robot deployments become more commonplace, people are likely to take on\nthe role of supervising robots (i.e., correcting their mistakes) rather than\ndirectly teaching them. Prior works on Learning from Corrections (LfC) have\nrelied on three key assumptions to interpret human feedback: (1) people correct\nthe robot only when there is significant task objective divergence; (2) people\ncan accurately predict if a correction is necessary; and (3) people trade off\nprecision and physical effort when giving corrections. In this work, we study\nhow two key factors (robot competency and motion legibility) affect how people\nprovide correction feedback and their implications on these existing\nassumptions. We conduct a user study ($N=60$) under an LfC setting where\nparticipants supervise and correct a robot performing pick-and-place tasks. We\nfind that people are more sensitive to suboptimal behavior by a highly\ncompetent robot compared to an incompetent robot when the motions are legible\n($p=0.0015$) and predictable ($p=0.0055$). 
In addition, people also tend to\nwithhold necessary corrections ($p < 0.0001$) when supervising an incompetent\nrobot and are more prone to offering unnecessary ones ($p = 0.0171$) when\nsupervising a highly competent robot. We also find that physical effort\npositively correlates with correction precision, providing empirical evidence\nto support this common assumption. We also find that this correlation is\nsignificantly weaker for an incompetent robot with legible motions than an\nincompetent robot with predictable motions ($p = 0.0075$). Our findings offer\ninsights for accounting for competency and legibility when designing robot\ninteraction behaviors and learning task objectives from corrections.\n","authors":["Shuangge Wang","Anjiabei Wang","Sofiya Goncharova","Brian Scassellati","Tesca Fitzgerald"],"pdf_url":"https://arxiv.org/pdf/2501.03515v1.pdf","comment":"to be published in the 2025 ACM/IEEE International Conference on\n Human-Robot Interaction (HRI)"},{"id":"http://arxiv.org/abs/2501.03467v1","updated":"2025-01-07T01:51:12Z","published":"2025-01-07T01:51:12Z","title":"FRESHR-GSI: A Generalized Safety Model and Evaluation Framework for\n Mobile Robots in Multi-Human Environments","summary":" Human safety is critical in applications involving close human-robot\ninteractions (HRI) and is a key aspect of physical compatibility between humans\nand robots. While measures of human safety in HRI exist, these mainly target\nindustrial settings involving robotic manipulators. Less attention has been\npaid to settings where mobile robots and humans share the space. This paper\nintroduces a new robot-centered directional framework of human safety. It is\nparticularly useful for evaluating mobile robots as they operate in\nenvironments populated by multiple humans. The framework integrates several key\nmetrics, such as each human's relative distance, speed, and orientation. 
The\ncore novelty lies in the framework's flexibility to accommodate different\napplication requirements while allowing for both the robot-centered and\nexternal observer points of view. We instantiate the framework by using RGB-D\nbased vision integrated with a deep learning-based human detection pipeline to\nyield a generalized safety index (GSI) that instantaneously assesses human\nsafety. We evaluate GSI's capability of producing appropriate, robust, and\nfine-grained safety measures in real-world experimental scenarios and compare\nits performance with extant safety models.\n","authors":["Pranav Pandey","Ramviyas Parasuraman","Prashant Doshi"],"pdf_url":"https://arxiv.org/pdf/2501.03467v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04264v2","updated":"2025-01-07T22:42:46Z","published":"2024-11-06T21:17:06Z","title":"MonoRollBot: 3-DOF Spherical Robot with Underactuated Single Compliant\n Actuator Design","summary":" Spherical rolling robots have garnered significant attention in the field of\nmobile robotics for applications such as inspection and space exploration.\nDesigning underactuated rolling robots poses challenges in achieving\nmulti-directional propulsion with high degrees of freedom while utilizing a\nlimited number of actuators. This paper presents the MonoRollBot, a novel\n3-degree-of-freedom (DOF) spherical robot that utilizes an underactuated\nmechanism driven by only a single spring-motor system. Unlike conventional\nspherical robots, MonoRollBot employs a minimalist actuation approach, relying\non only one motor and a passive spring to control its locomotion. The robot\nachieves 3-DOF motion through an innovative coupling of spring dynamics and\nmotor control. In this work, we detail the design of the MonoRollBot and\nevaluate its motion capabilities through design studies. 
We also study\nits locomotion behaviours under changes in rotating mass and stiffness\nproperties.\n","authors":["Zhiwei Liu","Seyed Amir Tafrishi"],"pdf_url":"https://arxiv.org/pdf/2411.04264v2.pdf","comment":"6 pages, 11 figures, accepted at IEEE RoboSoft 2025"},{"id":"http://arxiv.org/abs/2501.04170v1","updated":"2025-01-07T22:40:37Z","published":"2025-01-07T22:40:37Z","title":"A Bayesian Modeling Framework for Estimation and Ground Segmentation of\n Cluttered Staircases","summary":" Autonomous robot navigation in complex environments requires robust\nperception as well as high-level scene understanding due to perceptual\nchallenges, such as occlusions, and uncertainty introduced by robot movement.\nFor example, a robot climbing a cluttered staircase can misinterpret clutter as\na step, misrepresenting the state and compromising safety. This requires robust\nstate estimation methods capable of inferring the underlying structure of the\nenvironment even from incomplete sensor data. In this paper, we introduce a\nnovel method for robust state estimation of staircases. To address the\nchallenge of perceiving occluded staircases extending beyond the robot's\nfield-of-view, our approach combines an infinite-width staircase representation\nwith a finite endpoint state to capture the overall staircase structure. This\nrepresentation is integrated into a Bayesian inference framework to fuse noisy\nmeasurements enabling accurate estimation of staircase location even with\npartial observations and occlusions. Additionally, we present a segmentation\nalgorithm that works in conjunction with the staircase estimation pipeline to\naccurately identify clutter-free regions on a staircase. 
Our method is\nextensively evaluated on a real robot across diverse staircases, demonstrating\nsignificant improvements in estimation accuracy and segmentation performance\ncompared to baseline approaches.\n","authors":["Prasanna Sriganesh","Burhanuddin Shirose","Matthew Travers"],"pdf_url":"https://arxiv.org/pdf/2501.04170v1.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2501.04169v1","updated":"2025-01-07T22:33:47Z","published":"2025-01-07T22:33:47Z","title":"Learning to Transfer Human Hand Skills for Robot Manipulations","summary":" We present a method for teaching dexterous manipulation tasks to robots from\nhuman hand motion demonstrations. Unlike existing approaches that solely rely\non kinematics information without taking into account the plausibility of robot\nand object interaction, our method directly infers plausible robot manipulation\nactions from human motion demonstrations. To address the embodiment gap between\nthe human hand and the robot system, our approach learns a joint motion\nmanifold that maps human hand movements, robot hand actions, and object\nmovements in 3D, enabling us to infer one motion component from others. Our key\nidea is the generation of pseudo-supervision triplets, which pair human,\nobject, and robot motion trajectories synthetically. Through real-world\nexperiments with robot hand manipulation, we demonstrate that our data-driven\nretargeting method significantly outperforms conventional retargeting\ntechniques, effectively bridging the embodiment gap between human and robotic\nhands. Website at https://rureadyo.github.io/MocapRobot/.\n","authors":["Sungjae Park","Seungho Lee","Mingi Choi","Jiye Lee","Jeonghwan Kim","Jisoo Kim","Hanbyul Joo"],"pdf_url":"https://arxiv.org/pdf/2501.04169v1.pdf","comment":"Preprint. 
Under Review"},{"id":"http://arxiv.org/abs/2401.14554v2","updated":"2025-01-07T20:44:10Z","published":"2024-01-25T22:49:13Z","title":"GCBF+: A Neural Graph Control Barrier Function Framework for Distributed\n Safe Multi-Agent Control","summary":" Distributed, scalable, and safe control of large-scale multi-agent systems is\na challenging problem. In this paper, we design a distributed framework for\nsafe multi-agent control in large-scale environments with obstacles, where a\nlarge number of agents are required to maintain safety using only local\ninformation and reach their goal locations. We introduce a new class of\ncertificates, termed graph control barrier function (GCBF), which are based on\nthe well-established control barrier function theory for safety guarantees and\nutilize a graph structure for scalable and generalizable distributed control of\nMAS. We develop a novel theoretical framework to prove the safety of an\narbitrary-sized MAS with a single GCBF. We propose a new training framework\nGCBF+ that uses graph neural networks to parameterize a candidate GCBF and a\ndistributed control policy. The proposed framework is distributed and is\ncapable of taking point clouds from LiDAR, instead of actual state information,\nfor real-world robotic applications. We illustrate the efficacy of the proposed\nmethod through various hardware experiments on a swarm of drones with\nobjectives ranging from exchanging positions to docking on a moving target\nwithout collision. Additionally, we perform extensive numerical experiments,\nwhere the number and density of agents, as well as the number of obstacles,\nincrease. Empirical results show that in complex environments with agents with\nnonlinear dynamics (e.g., Crazyflie drones), GCBF+ outperforms the hand-crafted\nCBF-based method with the best performance by up to 20% for relatively\nsmall-scale MAS with up to 256 agents, and leading reinforcement learning (RL)\nmethods by up to 40% for MAS with 1024 agents. 
Furthermore, the proposed method\ndoes not compromise on performance, in terms of goal reaching, for\nachieving high safety rates, which is a common trade-off in RL-based methods.\n","authors":["Songyuan Zhang","Oswin So","Kunal Garg","Chuchu Fan"],"pdf_url":"https://arxiv.org/pdf/2401.14554v2.pdf","comment":"20 pages, 15 figures; Accepted by IEEE Transactions on Robotics\n (T-RO)"},{"id":"http://arxiv.org/abs/2407.08213v2","updated":"2025-01-07T20:08:13Z","published":"2024-07-11T06:30:46Z","title":"PrefCLM: Enhancing Preference-based Reinforcement Learning with\n Crowdsourced Large Language Models","summary":" Preference-based reinforcement learning (PbRL) is emerging as a promising\napproach to teaching robots through human comparative feedback, sidestepping\nthe need for complex reward engineering. However, the substantial volume of\nfeedback required in existing PbRL methods often leads to reliance on synthetic\nfeedback generated by scripted teachers. This approach necessitates intricate\nreward engineering again and struggles to adapt to the nuanced preferences\nparticular to human-robot interaction (HRI) scenarios, where users may have\nunique expectations toward the same task. To address these challenges, we\nintroduce PrefCLM, a novel framework that utilizes crowdsourced large language\nmodels (LLMs) as simulated teachers in PbRL. We utilize Dempster-Shafer Theory\nto fuse individual preferences from multiple LLM agents at the score level,\nefficiently leveraging their diversity and collective intelligence. We also\nintroduce a human-in-the-loop pipeline that facilitates collective refinements\nbased on user interactive feedback. Experimental results across various general\nRL tasks show that PrefCLM achieves competitive performance compared to\ntraditional scripted teachers and excels in facilitating more natural and\nefficient behaviors. 
A real-world user study (N=10) further demonstrates its\ncapability to tailor robot behaviors to individual user preferences,\nsignificantly enhancing user satisfaction in HRI scenarios.\n","authors":["Ruiqi Wang","Dezhong Zhao","Ziqin Yuan","Ike Obi","Byung-Cheol Min"],"pdf_url":"https://arxiv.org/pdf/2407.08213v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.08637v2","updated":"2025-01-07T19:26:17Z","published":"2024-06-12T20:50:26Z","title":"A Game Between Two Identical Dubins Cars: Evading a Conic Sensor in\n Minimum Time","summary":" A fundamental task in mobile robotics is keeping an intelligent agent under\nsurveillance with an autonomous robot as it travels in the environment. This\nwork studies a theoretical version of that problem involving one of the most\npopular vehicle platforms in robotics. In particular, we consider two identical\nDubins cars moving on a plane without obstacles. One of them plays as the\npursuer, and it is equipped with a limited field-of-view detection region\nmodeled as a semi-infinite cone with its apex at the pursuer's position. The\npursuer aims to maintain the other Dubins car, which plays as the evader, as\nmuch time as possible inside its detection region. On the contrary, the evader\nwants to escape as soon as possible. In this work, employing differential game\ntheory, we find the time-optimal motion strategies near the game's end. The\nanalysis of those trajectories reveals the existence of at least two singular\nsurfaces: a Transition Surface (also known as a Switch Surface) and an Evader's\nUniversal Surface. 
We also found that the barrier's standard construction\nproduces a surface that partially lies outside the playing space.\n","authors":["Ubaldo Ruiz"],"pdf_url":"https://arxiv.org/pdf/2406.08637v2.pdf","comment":"35 pages, 16 figures"},{"id":"http://arxiv.org/abs/2501.05478v1","updated":"2025-01-07T16:01:25Z","published":"2025-01-07T16:01:25Z","title":"Language and Planning in Robotic Navigation: A Multilingual Evaluation\n of State-of-the-Art Models","summary":" Large Language Models (LLMs) such as GPT-4, trained on huge amounts of\ndata spanning multiple domains, exhibit significant reasoning,\nunderstanding, and planning capabilities across various tasks. This study\npresents the first-ever work in Arabic language integration within the\nVision-and-Language Navigation (VLN) domain in robotics, an area that has been\nnotably underexplored in existing research. We perform a comprehensive\nevaluation of state-of-the-art multi-lingual Small Language Models (SLMs),\nincluding GPT-4o mini, Llama 3 8B, and Phi-3 medium 14B, alongside the\nArabic-centric LLM, Jais. Our approach utilizes the NavGPT framework, a pure\nLLM-based instruction-following navigation agent, to assess the impact of\nlanguage on navigation reasoning through zero-shot sequential action prediction\nusing the R2R dataset. Through comprehensive experiments, we demonstrate that\nour framework is capable of high-level planning for navigation tasks when\nprovided with instructions in both English and Arabic. 
However, certain models\nstruggled with reasoning and planning in the Arabic language due to inherent\nlimitations in their capabilities, sub-optimal performance, and parsing issues.\nThese findings highlight the importance of enhancing planning and reasoning\ncapabilities in language models for effective navigation, emphasizing this as a\nkey area for further development while also unlocking the potential of\nArabic-language models for impactful real-world applications.\n","authors":["Malak Mansour","Ahmed Aly","Bahey Tharwat","Sarim Hashmi","Dong An","Ian Reid"],"pdf_url":"https://arxiv.org/pdf/2501.05478v1.pdf","comment":null}],"Systems and Control":[{"id":"http://arxiv.org/abs/2408.13510v2","updated":"2025-01-07T18:16:17Z","published":"2024-08-24T08:12:22Z","title":"Intelligent Router for LLM Workloads: Improving Performance Through\n Workload-Aware Load Balancing","summary":" Large Language Model (LLM) workloads have distinct prefill and decode phases\nwith different compute and memory requirements which should ideally be\naccounted for when scheduling input queries across different LLM instances in a\ncluster. However existing scheduling algorithms treat LLM workloads as\nmonolithic jobs without considering the distinct characteristics of the two\nphases in each workload. This leads to sub-optimal scheduling and increased\nresponse latency. In this work, we start by characterizing factors affecting\nthe response latency during LLM inference serving. We establish that better\nload balancing of inference requests across the available LLM instances can\nimprove the end-to-end latency to a larger extent than merely focusing on\noptimizing the instance-level scheduler. Motivated by our findings, we propose\na heuristic-guided reinforcement learning-based intelligent router for\ndata-driven and workload-aware scheduling. 
Our router schedules queries across\nLLM instances by leveraging a trainable response-length predictor, and a novel\nformulation for estimating the impact of mixing different workloads and\nachieves over 11% lower end-to-end latency than existing approaches on a mix of\npublic datasets and 7.8% lower end-to-end latency on real workload data with\ndiverse input and output trends from Cloud Provider X. Additionally, the\nproposed framework can also serve as a standard for benchmarking different LLM\ninference schedulers since it provides the best latency for a given model,\nhardware, and instance-level scheduler combination.\n","authors":["Kunal Jain","Anjaly Parayil","Ankur Mallick","Esha Choukse","Xiaoting Qin","Jue Zhang","Íñigo Goiri","Rujia Wang","Chetan Bansal","Victor Rühle","Anoop Kulkarni","Steve Kofsky","Saravan Rajmohan"],"pdf_url":"https://arxiv.org/pdf/2408.13510v2.pdf","comment":"16 pages, 10 figures"},{"id":"http://arxiv.org/abs/2307.12235v3","updated":"2025-01-07T17:46:06Z","published":"2023-07-23T05:58:04Z","title":"Optimal Time-Invariant Distributed Formation Tracking for Second-Order\n Multi-Agent Systems","summary":" This paper addresses the optimal time-invariant formation tracking problem\nwith the aim of providing a distributed solution for multi-agent systems with\nsecond-order integrator dynamics. In the literature, most of the results\nrelated to multi-agent formation tracking do not consider energy issues while\ninvestigating distributed feedback control laws. In order to account for this\ncrucial design aspect, we contribute by formalizing and proposing a solution to\nan optimization problem that encapsulates trajectory tracking, distance-based\nformation control and input energy minimization, through a specific and key\nchoice of potential functions in the optimization cost. 
To this end, we show\nhow to compute the inverse dynamics in a centralized fashion by means of the\nProjector-Operator-based Newton's method for Trajectory Optimization (PRONTO)\nand, more importantly, we exploit such an offline solution as a general\nreference to devise a stabilizing online distributed control law. Finally,\nnumerical examples involving a cubic formation following a chicane-like path in\nthe 3D space are provided to validate the proposed control strategies.\n","authors":["Marco Fabris","Giulio Fattore","Angelo Cenedese"],"pdf_url":"https://arxiv.org/pdf/2307.12235v3.pdf","comment":"35 pages, 3 figures, accepted on March 27th, 2024 by the European\n Journal of Control (first submission: June 23rd, 2023)"},{"id":"http://arxiv.org/abs/2312.14788v3","updated":"2025-01-07T17:07:18Z","published":"2023-12-22T16:00:42Z","title":"Harnessing Uncertainty for a Separation Principle in Direct Data-Driven\n Predictive Control","summary":" Model Predictive Control (MPC) is a powerful method for complex system\nregulation, but its reliance on an accurate model poses many limitations in\nreal-world applications. Data-driven predictive control (DDPC) aims at\novercoming this limitation, by relying on historical data to provide\ninformation on the plant to be controlled. In this work, we present a unified\nstochastic framework for direct DDPC, where control actions are obtained by\noptimizing the Final Control Error (FCE), which is directly computed from\navailable data only and automatically weighs the impact of uncertainty on the\ncontrol objective. Our framework allows us to establish a separation principle\nfor Predictive Control, elucidating the role that predictive models and their\nuncertainty play in DDPC. Moreover, it generalizes existing DDPC methods, like\nregularized Data-enabled Predictive Control (DeePC) and $\\gamma$-DDPC,\nproviding a path toward noise-tolerant data-based control with rigorous\noptimality guarantees. 
The theoretical investigation is complemented by a\nseries of experiments (code available on GitHub:\nhttps://github.com/marcofabris92/a-separation-principle-in-d3pc), revealing\nthat the proposed method consistently outperforms or, at worst, matches\nexisting techniques without requiring tuning regularization parameters as other\nmethods do.\n","authors":["Alessandro Chiuso","Marco Fabris","Valentina Breschi","Simone Formentin"],"pdf_url":"https://arxiv.org/pdf/2312.14788v3.pdf","comment":"17 pages, 2 figures, 1 table, accepted by Automatica on October 31st,\n 2024 (first submission: December 22nd, 2023)"},{"id":"http://arxiv.org/abs/2501.03894v1","updated":"2025-01-07T16:03:13Z","published":"2025-01-07T16:03:13Z","title":"Robust Moving-horizon Estimation for Nonlinear Systems: From Perfect to\n Imperfect Optimization","summary":" Robust stability of moving-horizon estimators is investigated for nonlinear\ndiscrete-time systems that are detectable in the sense of incremental\ninput/output-to-state stability and are affected by disturbances. The estimate\nof a moving-horizon estimator stems from the on-line solution of a\nleast-squares minimization problem at each time instant. The resulting\nstability guarantees depend on the optimization tolerance in solving such\nminimization problems. Specifically, two main contributions are established:\n(i) the robust stability of the estimation error, while supposing to solve\nexactly the on-line minimization problem; (ii) the practical robust stability\nof the estimation error with state estimates obtained by an imperfect\nminimization. 
Finally, the construction of such robust moving-horizon\nestimators and the performances resulting from the design based on the\ntheoretical findings are showcased with two numerical examples.\n","authors":["Angelo Alessandri"],"pdf_url":"https://arxiv.org/pdf/2501.03894v1.pdf","comment":"18 pages, 2 figures, 24 bibliographic references"},{"id":"http://arxiv.org/abs/2004.00159v3","updated":"2025-01-07T15:36:31Z","published":"2020-03-31T23:20:33Z","title":"Resilient Control of Dynamic Flow Networks Subject to Stochastic\n Cyber-Physical Disruptions","summary":" Modern network systems, such as transportation and communication systems, are\nprone to cyber-physical disruptions and thus suffer efficiency loss. This paper\nstudies network resiliency, in terms of throughput, and develops resilient\ncontrol to improve throughput. We consider single-commodity networks that admit\ncongestion propagation. We also apply a Markov process to model disruption\nswitches. For throughput analysis, we first use insights into congestion\nspillback to propose novel Lyapunov functions and then exploit monotone network\ndynamics to reduce computational costs of verifying stability conditions. For\ncontrol design, we show that (i) for a network with infinite link storage\nspace, there exists an open-loop control that attains the min-expected-cut\ncapacity; (ii) for a network with observable disruptions that restrict maximum\nsending and/or receiving flows, there exists a mode-dependent control that\nattains the expected-min-cut capacity; (iii) for general networks, there exists\na closed-loop control with throughput guarantees. 
We also derive lower bounds\nof resiliency scores for a set of numerical examples and verify resiliency\nimprovement with our method.\n","authors":["Yu Tang","Li Jin"],"pdf_url":"https://arxiv.org/pdf/2004.00159v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03191v2","updated":"2025-01-07T15:04:47Z","published":"2024-12-04T10:23:27Z","title":"Soft Adaptive Feet for Legged Robots: An Open-Source Model for\n Locomotion Simulation","summary":" In recent years, artificial feet based on soft robotics and under-actuation\nprinciples emerged to improve mobility on challenging terrains. This paper\npresents the application of the MuJoCo physics engine to realize a digital twin\nof an adaptive soft foot developed for use with legged robots. We release the\nMuJoCo soft foot digital twin as open source to allow users and researchers to\nexplore new approaches to locomotion. The work includes the system modeling\ntechniques along with the kinematic and dynamic attributes involved. Validation\nis conducted through a rigorous comparison with bench tests on a physical\nprototype, replicating these experiments in simulation. Results are evaluated\nbased on sole deformation and contact forces during foot-obstacle interaction.\nThe foot model is subsequently integrated into simulations of the humanoid\nrobot COMAN+, replacing its original flat feet. Results show an improvement in\nthe robot's ability to negotiate small obstacles without altering its control\nstrategy. Ultimately, this study offers a comprehensive modeling approach for\nadaptive soft feet, supported by qualitative comparisons of bipedal locomotion\nwith state of the art robotic feet.\n","authors":["Matteo Crotti","Luca Rossini","Balint K. Hodossy","Anna Pace","Giorgio Grioli","Antonio Bicchi","Manuel G. 
Catalano"],"pdf_url":"https://arxiv.org/pdf/2412.03191v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.02893v2","updated":"2025-01-07T13:21:10Z","published":"2025-01-06T10:15:21Z","title":"A Volumetric Approach to Privacy of Dynamical Systems","summary":" Information-theoretic metrics, such as mutual information, have been widely\nused to evaluate privacy leakage in dynamic systems. However, these approaches\nare typically limited to stochastic systems and face computational challenges.\nIn this paper, we introduce a novel volumetric framework for analyzing privacy\nin systems affected by unknown but bounded noise. Our model considers a dynamic\nsystem comprising public and private states, where an observation set of the\npublic state is released. An adversary utilizes the observed public state to\ninfer an uncertainty set of the private state, referred to as the inference\nattack. We define the evolution dynamics of these inference attacks and\nquantify the privacy level of the private state using the volume of its\nuncertainty sets. For linear scalar systems, we derive an explicit formulation\nof the uncertainty set. For multi-dimensional linear systems, we develop an\napproximate computation method leveraging interval analysis. We investigate the\nproperties of the proposed volumetric privacy measure and demonstrate that it\nis bounded by the information gain derived from the observation set.\nFurthermore, we propose an optimization approach to designing privacy filter\nusing randomization and linear programming based on the proposed privacy\nmeasure. 
The effectiveness of the optimal privacy filter design is evaluated\nthrough a production-inventory case study, illustrating its robustness against\nthe inference attack.\n","authors":["Chuanghong Weng","Ehsan Nekouei"],"pdf_url":"https://arxiv.org/pdf/2501.02893v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03746v1","updated":"2025-01-07T12:40:11Z","published":"2025-01-07T12:40:11Z","title":"A Multimodal Lightweight Approach to Fault Diagnosis of Induction Motors\n in High-Dimensional Dataset","summary":" An accurate AI-based diagnostic system for induction motors (IMs) holds the\npotential to enhance proactive maintenance, mitigating unplanned downtime and\ncurbing overall maintenance costs within an industrial environment. Notably,\namong the prevalent faults in IMs, a Broken Rotor Bar (BRB) fault is frequently\nencountered. Researchers have proposed various fault diagnosis approaches using\nsignal processing (SP), machine learning (ML), deep learning (DL), and hybrid\narchitectures for BRB faults. One limitation in the existing literature is the\ntraining of these architectures on relatively small datasets, risking\noverfitting when implementing such systems in industrial environments. This\npaper addresses this limitation by implementing large-scale data of BRB faults\nby using a transfer-learning-based lightweight DL model named ShuffleNetV2 for\ndiagnosing one, two, three, and four BRB faults using current and vibration\nsignal data. Spectral images for training and testing are generated using a\nShort-Time Fourier Transform (STFT). The dataset comprises 57,500 images, with\n47,500 used for training and 10,000 for testing. Remarkably, the ShuffleNetV2\nmodel exhibited superior performance, in less computational cost as well as\naccurately classifying 98.856% of spectral images. To further enhance the\nvisualization of harmonic sidebands resulting from broken bars, Fast Fourier\nTransform (FFT) is applied to current and vibration data. 
The paper also\nprovides insights into the training and testing times for each model,\ncontributing to a comprehensive understanding of the proposed fault diagnosis\nmethodology. The findings of our research provide valuable insights into the\nperformance and efficiency of different ML and DL models, offering a foundation\nfor the development of robust fault diagnosis systems for induction motors in\nindustrial settings.\n","authors":["Usman Ali"],"pdf_url":"https://arxiv.org/pdf/2501.03746v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05022v5","updated":"2025-01-07T11:51:30Z","published":"2024-08-09T12:22:35Z","title":"Robust Backstepping Control of a Quadrotor Unmanned Aerial Vehicle Under\n Colored Noises","summary":" Advances in software and hardware technologies have facilitated the\nproduction of quadrotor unmanned aerial vehicles (UAVs). Nowadays, people\nactively use quadrotor UAVs in essential missions such as search and rescue,\ncounter-terrorism, firefighting, surveillance, and cargo transportation. While\nperforming these tasks, quadrotors must operate in noisy environments.\nTherefore, a robust controller design that can control the altitude and\nattitude of the quadrotor in noisy environments is of great importance. Many\nresearchers have focused only on white Gaussian noise in their studies, whereas\nresearchers need to consider the effects of all colored noises during the\noperation of the quadrotor. This study aims to design a robust controller that\nis resistant to all colored noises. Firstly, a nonlinear quadrotor model was\ncreated with MATLAB. Then, a backstepping controller resistant to colored\nnoises was designed. The designed backstepping controller was tested under\nGaussian white, pink, brown, blue, and purple noises. PID and Lyapunov-based\ncontroller designs were also carried out, and their time responses (rise time,\novershoot, settling time) were compared with those of the backstepping\ncontroller. 
In the simulations, time was in seconds, altitude was in meters,\nand roll, pitch, and yaw references were in radians. Rise and settling time\nvalues were in seconds, and overshoot value was in percent. When the obtained\nvalues are examined, simulations prove that the proposed backstepping\ncontroller has the least overshoot and the shortest settling time under all\nnoise types.\n","authors":["Mehmet Karahan"],"pdf_url":"https://arxiv.org/pdf/2408.05022v5.pdf","comment":"22 pages, 10 figures"},{"id":"http://arxiv.org/abs/2501.03691v1","updated":"2025-01-07T10:43:26Z","published":"2025-01-07T10:43:26Z","title":"Stabilization of Strictly Pre-Dissipative Receding Horizon Linear\n Quadratic Control by Terminal Costs","summary":" Asymptotic stability in receding horizon control is obtained under a strict\npre-dissipativity assumption, in the presence of suitable state constraints. In\nthis paper we analyze how terminal constraints can be replaced by suitable\nterminal costs. We restrict to the linear-quadratic setting as that allows us\nto obtain stronger results, while we analyze the full nonlinear case in a\nseparate contribution.\n","authors":["Mario Zanon","Lars Grüne"],"pdf_url":"https://arxiv.org/pdf/2501.03691v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03671v1","updated":"2025-01-07T10:18:37Z","published":"2025-01-07T10:18:37Z","title":"Imitation Learning of MPC with Neural Networks: Error Guarantees and\n Sparsification","summary":" This paper presents a framework for bounding the approximation error in\nimitation model predictive controllers utilizing neural networks. Leveraging\nthe Lipschitz properties of these neural networks, we derive a bound that\nguides dataset design to ensure the approximation error remains at chosen\nlimits. We discuss how this method can be used to design a stable neural\nnetwork controller with performance guarantees employing existing robust model\npredictive control approaches for data generation. 
Additionally, we introduce a\ntraining adjustment, which is based on the sensitivities of the optimization\nproblem and reduces dataset density requirements based on the derived bounds.\nWe verify that the proposed augmentation results in improvements to the\nnetwork's predictive capabilities and a reduction of the Lipschitz constant.\nMoreover, on a simulated inverted pendulum problem, we show that the approach\nresults in a closer match of the closed-loop behavior between the imitation and\nthe original model predictive controller.\n","authors":["Hendrik Alsmeier","Lukas Theiner","Anton Savchenko","Ali Mesbah","Rolf Findeisen"],"pdf_url":"https://arxiv.org/pdf/2501.03671v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03653v1","updated":"2025-01-07T09:33:50Z","published":"2025-01-07T09:33:50Z","title":"Study of Frictional and Impact Transients in Active-Passive Mechanical\n Pair","summary":" We consider an active-passive mechanical pair in which the relative motion of\nthe latter is constrained by the mechanical impact. The system dynamics is\ndescribed by the previously introduced modeling frameworks of force transition\nand dissipation through the nonlinear Coulomb friction and structural damping,\nthe latter in accord with Hertzian contact theory. The focus of the recent study\nis on combining both interaction mechanisms, and the detailed experimental\nevaluation which discloses validity of the modeling assumptions. Such\nmechanical pair interactions can be found in various mechatronic systems and\nmechanisms, like for example clutches, backlash elements, sliding items on the\nshaking and inclining surfaces, conveyor belts and others. 
This practical study\ndemonstrates and discusses the transients of a vibro-impact dynamics and shows\ntheoretical developments in line with experimental evaluation.\n","authors":["Michael Ruderman","Francesco De Rito"],"pdf_url":"https://arxiv.org/pdf/2501.03653v1.pdf","comment":"4 pages, 6 figures"},{"id":"http://arxiv.org/abs/2501.03628v1","updated":"2025-01-07T08:56:56Z","published":"2025-01-07T08:56:56Z","title":"A Novel Approach to Real-Time Short-Term Traffic Prediction based on\n Distributed Fiber-Optic Sensing and Data Assimilation with a Stochastic\n Cell-Automata Model","summary":" This paper demonstrates real-time short-term traffic flow prediction through\ndistributed fiber-optic sensing (DFOS) and data assimilation with a stochastic\ncell-automata-based traffic model. Traffic congestion on expressways is a\nsevere issue. To alleviate its negative impacts, it is necessary to optimize\ntraffic flow prior to becoming serious congestion. For this purpose, real-time\nshort-term traffic flow prediction is promising. However, conventional traffic\nmonitoring apparatus used in prediction methods faces a technical issue due to\nthe sparsity in traffic flow data. To overcome the issue for realizing\nreal-time traffic prediction, this paper employs DFOS, which enables to obtain\nspatially continuous and real-time traffic flow data along the road without\ndead zones. Using mean velocities derived from DFOS data as a feature\nextraction, this paper proposes a real-time data assimilation method for the\nshort-term prediction. As the theoretical model, the stochastic\nNishinari-Fukui-Schadschneider model is adopted. Future traffic flow is\nsimulated with the optimal values of model parameters estimated from observed\nmean velocities and the initial condition estimated as the latest microscopic\ntraffic state. This concept is validated using two congestion scenarios\nobtained in Japanese expressways. 
The results show that the mean absolute error\nof the predicted mean velocities is 10-15 km/h in the prediction horizon of 30\nminutes. Furthermore, the prediction error in congestion length and travel time\ndecreases by 40-84% depending on congestion scenarios when compared with\nconventional methods with traffic counters. This paper concludes that real-time\ndata assimilation using DFOS enables an accurate short-term traffic prediction.\n","authors":["Yoshiyuki Yajima","Hemant Prasad","Daisuke Ikefuji","Takemasa Suzuki","Shin Tominaga","Hitoshi Sakurai","Manabu Otani"],"pdf_url":"https://arxiv.org/pdf/2501.03628v1.pdf","comment":"22 pages, 11 figures"},{"id":"http://arxiv.org/abs/2501.03608v1","updated":"2025-01-07T08:19:27Z","published":"2025-01-07T08:19:27Z","title":"A 3D Continuous-Space Electromagnetic Channel Model for 6G Tri-Polarized\n Multi-user Communications","summary":" It is envisioned that the sixth generation (6G) and beyond 6G (B6G) wireless\ncommunication networks will enable global coverage in space, air, ground, and\nsea. In this case, both base stations and users can be mobile and will tend to\nmove continuously in three-dimensional (3D) space. Therefore, obtaining channel\nstate information (CSI) in 3D continuous-space is crucial for the design and\nperformance evaluation of future 6G and B6G wireless systems. On the other\nhand, new 6G technologies such as integrated sensing and communications (ISAC)\nwill also require prior knowledge of CSI in 3D continuous-space. In this paper,\na 3D continuous-space electromagnetic channel model is proposed for\ntri-polarized multi-user communications, taking into account scatterers and\nspherical wavefronts. Scattered fields are calculated using the method of\nmoments (MoM) with high accuracy. Spherical wave functions are utilized to\ndecompose the dyadic Green's functions that connect the transmitted source\ncurrents and the received electric fields. 
Simulation results demonstrate that\ntransmit power, apertures, scatterers, and sample intervals have significant\nimpacts on statistical properties and channel capacities, providing insights\ninto the performance of continuous-space electromagnetic channel models and the\ndesign of future wireless systems.\n","authors":["Yue Yang","Cheng-Xiang Wang","Jie Huang","John Thompson","H. Vincent Poor"],"pdf_url":"https://arxiv.org/pdf/2501.03608v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03577v1","updated":"2025-01-07T07:06:46Z","published":"2025-01-07T07:06:46Z","title":"Wireless Channel Measurements and Characterization in Industrial IoT\n Scenarios","summary":" Wireless Fidelity (Wi-Fi) communication technologies hold significant\npotential for realizing the Industrial Internet of Things (IIoT). In this\npaper, both Single-Input Single-Output (SISO) and polarized Multiple-Input\nMultiple-Output (MIMO) channel measurements are conducted in an IIoT scenario\nat the less congested Wi-Fi band, i.e., 5.5~GHz. The purpose is to investigate\nwireless characteristics of communications between access points and terminals\nmounted on automated guided vehicles as well as those surrounding manufacturing\nareas. For SISO channel measurements, statistical properties including the\ndelay Power Spectral Density (PSD), path loss, shadowing fading, delay spread,\nexcess delay, K-factor, and amplitude distribution of small-scale fading are\nanalyzed and compared with those observed in an office scenario. For MIMO\nchannel measurements, results show that there are multiple Dense Multipath\nComponent (DMC) processes in the delay PSD. An estimation algorithm based on\nthe algorithm for a single DMC process is proposed to effectively process the\nmulti-processes data. Moreover, delay, angular, power, and polarization\nproperties of DMCs are investigated and compared with those of specular\nmultipath components. 
Furthermore, effects of DMCs on Singular Values (SVs) and\nchannel capacities are explored. Ignoring DMCs can overestimate SVs and\nunderestimate channel capacities.\n","authors":["Li Zhang","Cheng-Xiang Wang","Zihao Zhou","Yuxiao Li","Jie Huang","Lijian Xin","Chun Pan","Dabo Zheng","Xiping Wu"],"pdf_url":"https://arxiv.org/pdf/2501.03577v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03552v1","updated":"2025-01-07T05:58:07Z","published":"2025-01-07T05:58:07Z","title":"Proxy Control Barrier Functions: Integrating Barrier-Based and\n Lyapunov-Based Safety-Critical Control Design","summary":" This work introduces a novel Proxy Control Barrier Function (PCBF) scheme\nthat integrates barrier-based and Lyapunov-based safety-critical control\nstrategies for strict-feedback systems with potentially unknown dynamics. The\nproposed method employs a modular design procedure, decomposing the original\nsystem into a proxy subsystem and a virtual tracking subsystem that are\ncontrolled by the control barrier function (CBF)-based and Lyapunov-based\ncontrollers, respectively. By integrating these separately designed\ncontrollers, the overall system's safety is ensured. Moreover, a new\nfilter-based disturbance observer is utilized to design a PCBF-based safe\ncontroller for strict-feedback systems subject to mismatched disturbances. This\napproach broadens the class of systems to which CBF-based methods can be\napplied and significantly simplifies CBF construction by requiring only the\nmodel of the proxy subsystem. 
The effectiveness of the proposed method is\ndemonstrated through numerical simulations.\n","authors":["Yujie Wang","Xiangru Xu"],"pdf_url":"https://arxiv.org/pdf/2501.03552v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03543v1","updated":"2025-01-07T05:37:59Z","published":"2025-01-07T05:37:59Z","title":"Distributionally Robust Joint Chance-Constrained Optimal Power Flow\n using Relative Entropy","summary":" Designing robust algorithms for the optimal power flow (OPF) problem is\ncritical for the control of large-scale power systems under uncertainty. The\nchance-constrained OPF (CCOPF) problem provides a natural formulation of the\ntrade-off between the operating cost and the constraint satisfaction rate. In\nthis work, we propose a new data-driven algorithm for the CCOPF problem, based\non distributionally robust optimization (DRO). We show that the\nproposed reformulation of the distributionally robust chance constraints is\nexact, whereas other approaches in the CCOPF literature rely on conservative\napproximations. We establish out-of-sample robustness guarantees for the\ndistributionally robust solution and prove that the solution is the most\nefficient among all approaches enjoying the same guarantees. We apply the\nproposed algorithm to the CCOPF problem and compare the performance of our\napproach with existing methods using simulations on IEEE benchmark power\nsystems.\n","authors":["Eli Brock","Haixiang Zhang","Javad Lavaei","Somayeh Sojoudi"],"pdf_url":"https://arxiv.org/pdf/2501.03543v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03503v1","updated":"2025-01-07T03:46:38Z","published":"2025-01-07T03:46:38Z","title":"Resilient Distributed Control for Uncertain Nonlinear Interconnected\n Systems under Network Anomaly","summary":" We address a distributed adaptive control methodology for nonlinear\ninterconnected systems possibly affected by network anomalies. 
In the framework\nof adaptive approximation, the distributed controller and parameter estimator\nare designed by exploiting a backstepping approach. The stability of the\ndistributed control system under anomalies is analyzed, where both local and\nneighboring anomaly effects are considered. To quantify the resilience of the\ninterconnected system under the action of network anomalies, we derive bounds\non the duration of each anomaly and the resting time between two consecutive\nanomalies. Specifically, when each anomaly duration is smaller than our\ndesigned upper bound, the interconnected system controlled by the distributed\napproximation-based controller remains asymptotically stable. Moreover, if the\nresting time between two consecutive anomalies is larger than the proposed\nbound, then all signals of the control system are guaranteed to be bounded. In\nthe paper, we show that under the action of the proposed distributed adaptive\ncontroller, the interconnected system remains stable in the presence of network\nanomalies, with both the qualitative and quantitative resilient conditions.\nExtensive simulation results show the effectiveness of our theoretical results.\n","authors":["Youqing Wang","Ying Li","Thomas Parisini","Dong Zhao"],"pdf_url":"https://arxiv.org/pdf/2501.03503v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03496v1","updated":"2025-01-07T03:35:14Z","published":"2025-01-07T03:35:14Z","title":"A Unified Attack Detection Strategy for Multi-Agent Systems over\n Transient and Steady Stages","summary":" This paper proposes a unified detection strategy against three kinds of\nattacks for multi-agent systems (MASs) which is applicable to both transient\nand steady stages. For attacks on the communication layer, a watermarking-based\ndetection scheme with Kullback-Leibler (KL) divergence is designed. 
Different\nfrom traditional communication schemes, each agent transmits a message set\ncontaining two state values with different types of watermarking. It is found\nthat the detection performance is determined by the relevant parameters of the\nwatermarking signal. Unlike the existing detection manoeuvres, such a scheme is\ncapable of transient and steady stages. For attacks on the agent layer, a\nconvergence rate related detection approach is put forward. It is shown that\nthe resilience of the considered system is characterized by the coefficient and\noffset of the envelope. For hybrid attacks, based on the above detection\nmechanisms, a general framework resorting to trusted agents is presented, which\nrequires weaker graph conditions and less information transmission. Finally, an\nexample associated with the platooning of connected vehicles is given to\nsupport the theoretical results.\n","authors":["Jinming Gao","Yijing Wang","Wentao Zhang","Rui Zhao","Yang Shi","Zhiqiang Zuo"],"pdf_url":"https://arxiv.org/pdf/2501.03496v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.09061v2","updated":"2025-01-07T01:53:54Z","published":"2024-06-13T12:50:57Z","title":"Joint Observer Gain and Input Design for Asymptotic Active Fault\n Diagnosis","summary":" This paper proposes a joint gain and input design method for observer-based\nasymptotic active fault diagnosis, which is based on a newly-defined notion\nnamed the excluding degree of the origin from a zonotope. Using the excluding\ndegree, a quantitative specification is obtained to characterize the\nperformance of set-based robust fault diagnosis. Furthermore, a single gain\ndesign method and a joint gain and input design method are proposed,\nrespectively. This is the first work to achieve a joint observer gain and input\ndesign for set-based active fault diagnosis. 
Compared with the existing methods\nthat design gains and input separately, the proposed joint gain and input\ndesign method has advantages to exploit the fault diagnosis potential of\nobserver-based schemes. Finally, several examples are used to illustrate the\neffectiveness of the proposed methods.\n","authors":["Feng Xu","Yiming Wan","Ye Wang","Vicenc Puig"],"pdf_url":"https://arxiv.org/pdf/2406.09061v2.pdf","comment":"Provisionally accepted by Automatica as Regular Paper"},{"id":"http://arxiv.org/abs/2501.03465v1","updated":"2025-01-07T01:47:49Z","published":"2025-01-07T01:47:49Z","title":"Extending Internet Access Over LoRa for Internet of Things and Critical\n Applications","summary":" LoRa bridges the gap between remote locations and mainstream networks,\nenabling large-scale Internet of Things (IoT) deployments. Despite the recent\nadvancements around LoRa, Internet access over this technology is still largely\nunexplored. Most existing solutions only handle packets within the local LoRa\nnetwork and do not interact with web applications. This limits the scalability\nand the ability to deliver essential web services in disconnected regions. This\nwork proposes and implements ILoRa to extend the public Internet to\ndisconnected areas for essential service delivery. ILoRa enables accessing\nApplication Programming Interfaces (APIs) and web pages on the Internet over a\nLoRa backbone network. It comprises an ILoRa coordinator code (ICN) and access\npoint nodes (APNs). The ICN interfaces the LoRa network with the public\nInternet and interprets content. The APN tethers a WiFi hotspot to which\ndevices connect and access the web content. This work further proposes data\nhandling methods for ICNs and APNs. An actual hardware-based implementation\nvalidates the proposed system. 
The implementation achieves a throughput of 1.06\nkbps tested for an Internet-based API returning JSON data of 930 B.\nFurthermore, the APN consumed approximately $0.162$A current, and the resource\nutilization on the ICN was minimal.\n","authors":["Atonu Ghosh","Devadeep Misra","Hirdesh Mewada"],"pdf_url":"https://arxiv.org/pdf/2501.03465v1.pdf","comment":"8 pages, 8 figures"},{"id":"http://arxiv.org/abs/2412.00252v2","updated":"2025-01-07T23:33:22Z","published":"2024-11-29T20:48:50Z","title":"Localization Phenomena in Large-Scale Networked Systems: Robustness and\n Fragility of Dynamics","summary":" We study phenomena where some eigenvectors of a graph Laplacian are largely\nconfined in small subsets of the graph. These localization phenomena are\nsimilar to those generally termed Anderson Localization in the Physics\nliterature, and are related to the complexity of the structure of large graphs\nin still unexplored ways. Using spectral perturbation theory and\npseudo-spectrum analysis, we explain how the presence of localized eigenvectors\ngives rise to fragilities (low robustness margins) to unmodeled node or link\ndynamics. Our analysis is demonstrated by examples of networks with relatively\nlow complexity, but with features that appear to induce eigenvector\nlocalization. The implications of this newly-discovered fragility phenomenon\nare briefly discussed.\n","authors":["Poorva Shukla","Bassam Bamieh"],"pdf_url":"https://arxiv.org/pdf/2412.00252v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04160v1","updated":"2025-01-07T22:19:06Z","published":"2025-01-07T22:19:06Z","title":"Collaborative Spacecraft Servicing under Partial Feedback using\n Lyapunov-based Deep Neural Networks","summary":" Multi-agent systems are increasingly applied in space missions, including\ndistributed space systems, resilient constellations, and autonomous rendezvous\nand docking operations. 
A critical emerging application is collaborative\nspacecraft servicing, which encompasses on-orbit maintenance, space debris\nremoval, and swarm-based satellite repositioning. These missions involve\nservicing spacecraft interacting with malfunctioning or defunct spacecraft\nunder challenging conditions, such as limited state information, measurement\ninaccuracies, and erratic target behaviors. Existing approaches often rely on\nassumptions of full state knowledge or single-integrator dynamics, which are\nimpractical for real-world applications involving second-order spacecraft\ndynamics. This work addresses these challenges by developing a distributed\nstate estimation and tracking framework that requires only relative position\nmeasurements and operates under partial state information. A novel\n$\\rho$-filter is introduced to reconstruct unknown states using locally\navailable information, and a Lyapunov-based deep neural network adaptive\ncontroller is developed that adaptively compensates for uncertainties stemming\nfrom unknown spacecraft dynamics. To ensure the collaborative spacecraft\nregulation problem is well-posed, a trackability condition is defined. A\nLyapunov-based stability analysis is provided to ensure exponential convergence\nof errors in state estimation and spacecraft regulation to a neighborhood of\nthe origin under the trackability condition. The developed method eliminates\nthe need for expensive velocity sensors or extensive pre-training, offering a\npractical and robust solution for spacecraft servicing in complex, dynamic\nenvironments.\n","authors":["Cristian F. Nino","Omkar Sudhir Patil","Christopher D. Petersen","Sean Phillips","Warren E. 
Dixon"],"pdf_url":"https://arxiv.org/pdf/2501.04160v1.pdf","comment":"24 pages, 4 Figures, Journal"},{"id":"http://arxiv.org/abs/2501.04120v1","updated":"2025-01-07T20:02:11Z","published":"2025-01-07T20:02:11Z","title":"Bridging Impulse Control of Piecewise Deterministic Markov Processes and\n Markov Decision Processes: Frameworks, Extensions, and Open Challenges","summary":" Control theory plays a pivotal role in understanding and optimizing the\nbehavior of complex dynamical systems across various scientific and engineering\ndisciplines. Two key frameworks that have emerged for modeling and solving\ncontrol problems in stochastic systems are piecewise deterministic Markov\nprocesses (PDMPs) and Markov decision processes (MDPs). Each framework has its\nunique strengths, and their intersection offers promising opportunities for\ntackling a broad class of problems, particularly in the context of impulse\ncontrols and decision-making in complex systems.\n The relationship between PDMPs and MDPs is a natural subject of exploration,\nas embedding impulse control problems for PDMPs into the MDP framework could\nopen new avenues for their analysis and resolution. Specifically, this\nintegration would allow leveraging the computational and theoretical tools\ndeveloped for MDPs to address the challenges inherent in PDMPs. On the other\nhand, PDMPs can offer a versatile and simple paradigm to model continuous time\nproblems that are often described as discrete-time MDPs parametrized by complex\ntransition kernels. This transformation has the potential to bridge the gap\nbetween the two frameworks, enabling solutions to previously intractable\nproblems and expanding the scope of both fields. This paper presents a\ncomprehensive review of two research domains, illustrated through a recurring\nmedical example. 
The example is revisited and progressively formalized within\nthe framework of the various concepts and objects introduced.\n","authors":["Alice Cleynen","Benoîte de Saporta","Orlane Rossini","Régis Sabbadin","Amélie Vernay"],"pdf_url":"https://arxiv.org/pdf/2501.04120v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04104v1","updated":"2025-01-07T19:24:11Z","published":"2025-01-07T19:24:11Z","title":"Security by Design Issues in Autonomous Vehicles","summary":" As autonomous vehicle (AV) technology advances towards maturity, it becomes\nimperative to examine the security vulnerabilities within these cyber-physical\nsystems. While conventional cyber-security concerns are often at the forefront\nof discussions, it is essential to delve deeper into the various layers of\nvulnerability that are often overlooked within mainstream frameworks. Our goal\nis to spotlight imminent challenges faced by AV operators and explore emerging\ntechnologies for comprehensive solutions. This research outlines the diverse\nsecurity layers, spanning physical, cyber, coding, and communication aspects,\nin the context of AVs. Furthermore, we provide insights into potential\nsolutions for each potential attack vector, ensuring that autonomous vehicles\nremain secure and resilient in an evolving threat landscape.\n","authors":["Martin Higgins","Devki Jha","David Blundell","David Wallom"],"pdf_url":"https://arxiv.org/pdf/2501.04104v1.pdf","comment":null}],"Optimization and Control":[{"id":"http://arxiv.org/abs/2409.08861v5","updated":"2025-01-07T18:12:27Z","published":"2024-09-13T14:22:14Z","title":"Adjoint Matching: Fine-tuning Flow and Diffusion Generative Models with\n Memoryless Stochastic Optimal Control","summary":" Dynamical generative models that produce samples through an iterative\nprocess, such as Flow Matching and denoising diffusion models, have seen\nwidespread use, but there have not been many theoretically-sound methods for\nimproving these models with reward fine-tuning. 
In this work, we cast reward\nfine-tuning as stochastic optimal control (SOC). Critically, we prove that a\nvery specific memoryless noise schedule must be enforced during fine-tuning, in\norder to account for the dependency between the noise variable and the\ngenerated samples. We also propose a new algorithm named Adjoint Matching which\noutperforms existing SOC algorithms, by casting SOC problems as a regression\nproblem. We find that our approach significantly improves over existing methods\nfor reward fine-tuning, achieving better consistency, realism, and\ngeneralization to unseen human preference reward models, while retaining sample\ndiversity.\n","authors":["Carles Domingo-Enrich","Michal Drozdzal","Brian Karrer","Ricky T. Q. Chen"],"pdf_url":"https://arxiv.org/pdf/2409.08861v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08232v2","updated":"2025-01-07T17:49:59Z","published":"2024-08-15T15:57:59Z","title":"Characterizations of the Aubin Property of the Solution Mapping for\n Nonlinear Semidefinite Programming","summary":" In this paper, we study the Aubin property of the Karush-Kuhn-Tucker solution\nmapping for the nonlinear semidefinite programming (NLSDP) problem at a locally\noptimal solution. In the literature, it is known that the Aubin property\nimplies the constraint nondegeneracy by Fusek [SIAM J. Optim. 23 (2013), pp.\n1041-1061] and the second-order sufficient condition by Ding et al. [SIAM J.\nOptim. 27 (2017), pp. 67-90]. Based on the Mordukhovich criterion, here we\nfurther prove that the strong second-order sufficient condition is also\nnecessary for the Aubin property to hold. Consequently, several equivalent\nconditions including the strong regularity are established for NLSDP's Aubin\nproperty. Together with the recent progress made by Chen et al. on the\nequivalence between the Aubin property and the strong regularity for nonlinear\nsecond-order cone programming [SIAM J. 
Optim., in press; arXiv:2406.13798v3\n(2024)], this paper constitutes a significant step forward in characterizing\nthe Aubin property for general non-polyhedral $C^2$-cone reducible constrained\noptimization problems.\n","authors":["Liang Chen","Ruoning Chen","Defeng Sun","Liping Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.08232v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00568v2","updated":"2025-01-07T17:36:14Z","published":"2024-11-01T13:26:13Z","title":"Constrained Sampling with Primal-Dual Langevin Monte Carlo","summary":" This work considers the problem of sampling from a probability distribution\nknown up to a normalization constant while satisfying a set of statistical\nconstraints specified by the expected values of general nonlinear functions.\nThis problem finds applications in, e.g., Bayesian inference, where it can\nconstrain moments to evaluate counterfactual scenarios or enforce desiderata\nsuch as prediction fairness. Methods developed to handle support constraints,\nsuch as those based on mirror maps, barriers, and penalties, are not suited for\nthis task. This work therefore relies on gradient descent-ascent dynamics in\nWasserstein space to put forward a discrete-time primal-dual Langevin Monte\nCarlo algorithm (PD-LMC) that simultaneously constrains the target distribution\nand samples from it. We analyze the convergence of PD-LMC under standard\nassumptions on the target distribution and constraints, namely (strong)\nconvexity and log-Sobolev inequalities. To do so, we bring classical\noptimization arguments for saddle-point algorithms to the geometry of\nWasserstein space. We illustrate the relevance and effectiveness of PD-LMC in\nseveral applications.\n","authors":["Luiz F. O. Chamon","Mohammad Reza Karimi","Anna Korba"],"pdf_url":"https://arxiv.org/pdf/2411.00568v2.pdf","comment":"39 pages, 14 figures. 
Published at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2501.00799v2","updated":"2025-01-07T17:32:19Z","published":"2025-01-01T10:50:35Z","title":"Follow The Approximate Sparse Leader for No-Regret Online Sparse Linear\n Approximation","summary":" We consider the problem of \\textit{online sparse linear approximation}, where\none predicts the best sparse approximation of a sequence of measurements in\nterms of linear combination of columns of a given measurement matrix. Such\nonline prediction problems are ubiquitous, ranging from medical trials to web\ncaching to resource allocation. The inherent difficulty of offline recovery\nalso makes the online problem challenging. In this letter, we propose\nFollow-The-Approximate-Sparse-Leader, an efficient online meta-policy to\naddress this online problem. Through a detailed theoretical analysis, we prove\nthat under certain assumptions on the measurement sequence, the proposed policy\nenjoys a data-dependent sublinear upper bound on the static regret, which can\nrange from logarithmic to square-root. Numerical simulations are performed to\ncorroborate the theoretical findings and demonstrate the efficacy of the\nproposed online policy.\n","authors":["Samrat Mukhopadhyay","Debasmita Mukherjee"],"pdf_url":"https://arxiv.org/pdf/2501.00799v2.pdf","comment":"12 pages, 5 figures, corrected title, added proof of a lemma in\n appendix"},{"id":"http://arxiv.org/abs/2501.03954v1","updated":"2025-01-07T17:26:35Z","published":"2025-01-07T17:26:35Z","title":"Learning to Relax Nonconvex Quadratically Constrained Quadratic Programs","summary":" Quadratically constrained quadratic programs (QCQPs) are ubiquitous in\noptimization: Such problems arise in applications from operations research,\npower systems, signal processing, chemical engineering, portfolio theory, among\nothers. 
Despite their flexibility in modeling real-life situations and the\nrecent effort to understand their properties, nonconvex QCQPs are hard to solve\nin practice. Most of the approaches in the literature are based on either\nLinear Programming (LP) or Semidefinite Programming (SDP) relaxations, each of\nwhich works very well for some problem subclasses but performs poorly on others.\nIn this paper, we develop a relaxation selection procedure for nonconvex QCQPs\nthat can adaptively decide whether an LP- or SDP-based approach is expected to\nbe more beneficial by considering the instance structure. The proposed\nmethodology relies on utilizing machine learning methods that involve features\nderived from spectral properties and sparsity patterns of data matrices, and\nonce trained appropriately, the prediction model is applicable to any instance\nwith an arbitrary number of variables and constraints. We train and test\nclassification and regression models over synthetically generated instances,\nand empirically show the efficacy of our approach.\n","authors":["Buket Ozen","Burak Kocuk"],"pdf_url":"https://arxiv.org/pdf/2501.03954v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.13798v3","updated":"2025-01-07T17:19:28Z","published":"2024-06-19T19:58:01Z","title":"Aubin Property and Strong Regularity Are Equivalent for Nonlinear\n Second-Order Cone Programming","summary":" This paper solves a fundamental open problem in variational analysis on the\nequivalence between the Aubin property and the strong regularity for nonlinear\nsecond-order cone programming (SOCP) at a locally optimal solution. We achieve\nthis by introducing a reduction approach to the Aubin property characterized by\nthe Mordukhovich criterion and a lemma of alternative choices on cones to\nreplace the S-lemma used in Outrata and Ram\\'irez [SIAM J. Optim. 21 (2011)\n789-823] and Opazo, Outrata, and Ram\\'irez [SIAM J. Optim. 
27 (2017)\n2141-2151], where the same SOCP was considered under the strict complementarity\ncondition except for possibly only one block of constraints. As a byproduct, we\nalso offer a new approach to the well-known result of Dontchev and Rockafellar\n[SIAM J. Optim. 6 (1996) 1087-1105] on the equivalence of the two concepts in\nconventional nonlinear programming.\n","authors":["Liang Chen","Ruoning Chen","Defeng Sun","Junyuan Zhu"],"pdf_url":"https://arxiv.org/pdf/2406.13798v3.pdf","comment":"To appear in SIAM Journal on Optimization"},{"id":"http://arxiv.org/abs/2501.03933v1","updated":"2025-01-07T16:49:01Z","published":"2025-01-07T16:49:01Z","title":"Data-driven Optimization for the Evolve-Filter-Relax regularization of\n convection-dominated flows","summary":" Numerical stabilization techniques are often employed in under-resolved\nsimulations of convection-dominated flows to improve accuracy and mitigate\nspurious oscillations. Specifically, the Evolve-Filter-Relax (EFR) algorithm is\na framework which consists in evolving the solution, applying a filtering step\nto remove high-frequency noise, and relaxing through a convex combination of\nfiltered and original solutions. The stability and accuracy of the EFR solution\nstrongly depend on two parameters, the filter radius $\\delta$ and the\nrelaxation parameter $\\chi$. Standard choices for these parameters are usually\nfixed in time, and related to the full order model setting, i.e., the grid size\nfor $\\delta$ and the time step for $\\chi$. This paper makes two significant\nimprovements to the standard EFR framework by proposing: (i) time-dependent\nparameters, (ii) data-driven adaptive optimization of the parameters in time,\nconsidering a fully-resolved simulation as a reference. In particular, we\npropose three different classes of Optimized-EFR strategies, aiming to optimize\none or both parameters. 
Moreover, we investigate the accuracy and efficiency of\nthe proposed optimization algorithms considering different objective functions,\nboth local (point-valued) and global (such as the kinetic energy). The new\nOptimized-EFR strategies are tested in the under-resolved simulation of a\nturbulent flow past a cylinder at $Re=1000$. The new Optimized-EFR results are\nmore accurate than the standard EFR solution while maintaining a similar\ncomputational time. In particular, we show that using a global objective\nfunction and including the $H^1$ velocity seminorm is crucial to accurately\nmatch the reference flow dynamics.\n","authors":["Anna Ivagnes","Maria Strazzullo","Michele Girfoglio","Traian Iliescu","Gianluigi Rozza"],"pdf_url":"https://arxiv.org/pdf/2501.03933v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.05017v3","updated":"2025-01-07T16:21:59Z","published":"2024-12-06T13:09:34Z","title":"Reduction from the partition problem: Dynamic lot sizing problem with\n polynomial complexity","summary":" In this note, we polynomially reduce an instance of the partition problem to\na dynamic lot sizing problem, and show that solving the latter problem solves\nthe former problem. By solving the dynamic programming formulation of the\ndynamic lot sizing problem, we show that the instance of the partition problem\ncan be solved with pseudo-polynomial time complexity. Numerical results on\nsolving instances of the partition problem are also provided using an\nimplementation of the algorithm that solves the dynamic program.\n","authors":["Chee-Khian Sim"],"pdf_url":"https://arxiv.org/pdf/2412.05017v3.pdf","comment":"11 pages. 
Latest version contains improved arguments and results"},{"id":"http://arxiv.org/abs/2501.03906v1","updated":"2025-01-07T16:21:40Z","published":"2025-01-07T16:21:40Z","title":"A regularized transportation cost stemming from entropic approximation","summary":" We study the entropic regularizations of optimal transport problems under\nsuitable summability assumptions on the point-wise transport cost. These\nsummability assumptions already appear in the literature. However, we show that\nthe weakest compactness conditions that can be derived are already enough to\nobtain the convergence of the regularized functionals. This approach allows us\nto characterize the variational limit of the regularization even when it does\nnot converge to the original problem. The results apply also to problems with\nmore than two marginals.\n","authors":["Camilla Brizzi","Luigi De Pascale","Anna Kausamo"],"pdf_url":"https://arxiv.org/pdf/2501.03906v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03882v1","updated":"2025-01-07T15:44:16Z","published":"2025-01-07T15:44:16Z","title":"An obstruction to small-time local controllability for a bilinear\n Schrödinger equation","summary":" We consider the small-time local controllability in the vicinity of the\nground state of a bilinear Schr\\\"odinger equation with Neumann boundary\nconditions. We prove that, when the linearized system is not controllable, the\nnonlinear system is not controllable, due to a quadratic obstruction involving\nthe squared norm of the control's primitive. This obstruction has been known\nsince 1983 for ODEs and observed for some PDEs since 2006. However, our\nsituation is more intricate since the kernel describing the quadratic expansion\nof the solution is not twice differentiable. 
We thus follow a Fourier-based\napproach, closer to the one used for quadratic obstructions of fractional\nSobolev regularity.\n In this Fourier-based approach, a challenge is to formulate a necessary and\nsufficient condition on the convolution kernel, for the quadratic form to be\ncoercive. In previous studies, the coercivity was ensured by a signed\nasymptotic equivalent for the Fourier transform of the convolution kernel of\nthe form $\\widehat{K}(\\omega) \\sim \\omega^{-2}$ as $|\\omega| \\to \\infty$. In\nour case, $\\widehat{K}$ is a distribution which has singularities and changes\nsign up to infinity. We still prove coercivity because one of the signs appears\ntoo infrequently.\n","authors":["Karine Beauchard","Frédéric Marbach","Thomas Perrin"],"pdf_url":"https://arxiv.org/pdf/2501.03882v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03560v2","updated":"2025-01-07T14:27:58Z","published":"2024-05-06T15:29:55Z","title":"Converse Lyapunov Results for Stability of Switched Systems with Average\n Dwell-Time","summary":" This article provides a characterization of stability for switched nonlinear\nsystems under average dwell-time constraints, in terms of necessary and\nsufficient conditions involving multiple Lyapunov functions. Earlier converse\nresults focus on switched systems with dwell-time constraints only, and the\nresulting inequalities depend on the flow of individual subsystems. With the\nhelp of a counterexample, we show that a lower bound that guarantees stability\nfor dwell-time switching signals may not necessarily imply stability for\nswitching signals with the same lower bound on the average dwell-time. Based on\nthese two observations, we provide a converse result for the average dwell-time\nconstrained systems in terms of inequalities which do not depend on the flow of\nindividual subsystems and are easier to check. 
The particular case of linear\nswitched systems is studied as a corollary to our main result.\n","authors":["Matteo Della Rossa","Aneel Tanwani"],"pdf_url":"https://arxiv.org/pdf/2405.03560v2.pdf","comment":"To appear in ESAIM: Control, Optimisation and Calculus of Variations\n (ESAIM: COCV)"},{"id":"http://arxiv.org/abs/2501.02098v2","updated":"2025-01-07T14:20:44Z","published":"2025-01-03T20:51:07Z","title":"Graph-Based Modeling and Decomposition of Hierarchical Optimization\n Problems","summary":" We present a graph-theoretic modeling approach for hierarchical optimization\nthat leverages the OptiGraph abstraction implemented in the Julia package\nPlasmo.jl. We show that the abstraction is flexible and can effectively capture\ncomplex hierarchical connectivity that arises from decision-making over\nmultiple spatial and temporal scales (e.g., integration of planning,\nscheduling, and operations in manufacturing and infrastructures). We also show\nthat the graph abstraction facilitates the conceptualization and implementation\nof decomposition and approximation schemes. Specifically, we propose a\ngraph-based Benders decomposition (gBD) framework that enables the exploitation\nof hierarchical (nested) structures and that uses graph\naggregation/partitioning procedures to discover such structures. In addition,\nwe provide a Julia implementation of gBD, which we call PlasmoBenders.jl. We\nillustrate the capabilities using examples arising in the context of energy and\npower systems.\n","authors":["David L. Cole","Filippo Pecci","Omar J. Guerra","Harsha Gangammanavar","Jesse D. Jenkins","Victor M. 
Zavala"],"pdf_url":"https://arxiv.org/pdf/2501.02098v2.pdf","comment":"66 pages, 3 tables, 28 figures, updated abstract"},{"id":"http://arxiv.org/abs/2501.03784v1","updated":"2025-01-07T13:45:29Z","published":"2025-01-07T13:45:29Z","title":"Optimal control of a nonlinear kinetic Fokker-Planck equation","summary":" A tracking type optimal control problem for a nonlinear and nonlocal kinetic\nFokker-Planck equation which arises as the mean field limit of an interacting\nparticle systems that is subject to distance dependent random fluctuations is\nstudied. As the equation of interest is only hypocoercive and the control\noperator is unbounded with respect to the canonical state space, classical\nvariational solution techniques cannot be utilized directly. Instead, the\nconcept of admissible control operators is employed. For the underlying\nnonlinearities, local Lipschitz estimates are derived and subsequently used\nwithin a fixed point argument to obtain local existence of solutions. Again,\ndue to hypocoercivity, existence of optimal controls requires non standard\ntechniques as (compensated) compactness arguments are not readily available.\n","authors":["Tobias Breiten","Karl Kunisch"],"pdf_url":"https://arxiv.org/pdf/2501.03784v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.01369v2","updated":"2025-01-07T13:41:00Z","published":"2022-05-03T08:36:05Z","title":"Improving the Convergence Rates for the Kinetic Fokker-Planck Equation\n by Optimal Control","summary":" The long time behavior and detailed convergence analysis of Langevin\nequations has received increased attention over the last years. Difficulties\narise from a lack of coercivity, usually termed hypocoercivity, of the\nunderlying kinetic Fokker-Planck operator which is a consequence of the\npartially deterministic nature of a second order stochastic differential\nequation. 
In this manuscript, the effect of controlling the confinement\npotential without altering the original invariant measure is investigated. This\nleads to an abstract bilinear control system with an unbounded but\ninfinite-time admissible control operator which, by means of an artificial\ndiffusion approach, is shown to possess a unique solution. The compactness of\nthe underlying semigroup is further used to define an infinite-horizon optimal\ncontrol problem on an appropriately reduced state space. Under smallness\nassumptions on the initial data, feasibility of and existence of a solution to\nthe optimal control problem are discussed. Numerical results based on a local\napproximation based on a shifted Riccati equation illustrate the theoretical\nfindings.\n","authors":["Tobias Breiten","Karl Kunisch"],"pdf_url":"https://arxiv.org/pdf/2205.01369v2.pdf","comment":"32 pages, 4 figures"},{"id":"http://arxiv.org/abs/2501.03773v1","updated":"2025-01-07T13:29:37Z","published":"2025-01-07T13:29:37Z","title":"The maximal angle between $3 \\times 3$ copositive matrices","summary":" In 2010, Hiriart-Urruty and Seeger posed the problem of finding the maximal\npossible angle $\\theta_n$ between two copositive matrices of order $n$. They\nproved that $\\theta_2=\\frac{3}{4}\\pi$. In this paper, we study the maximal\nangle between two copositive matrices of order 3. We show that\n$\\theta_3=\\frac{3}{4}\\pi$ and give all possible pairs of matrices achieving\nthis maximal angle. 
The proof is based on case analysis and uses optimization\nand basic linear algebra techniques.\n","authors":["Daniel Gourion"],"pdf_url":"https://arxiv.org/pdf/2501.03773v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11639v3","updated":"2025-01-07T13:21:43Z","published":"2024-11-18T15:19:04Z","title":"Trade-off Invariance Principle for minimizers of regularized functionals","summary":" In this paper, we consider functionals of the form $H_\\alpha(u)=F(u)+\\alpha\nG(u)$ with $\\alpha\\in[0,+\\infty)$, where $u$ varies in a set $U\\neq\\emptyset$\n(without further structure). We first show that, excluding at most countably\nmany values of $\\alpha$, we have that $\\inf_{H_\\alpha^\\star}G=\n\\sup_{H_\\alpha^\\star}G$, where $H_\\alpha^\\star := \\arg \\min_U H_\\alpha$, which\nis assumed to be non-empty. We further prove a stronger result that concerns\nthe invariance of the limiting value of the functional $G$ along minimizing\nsequences for $H_\\alpha$. Moreover, we show to what extent these findings\ngeneralize to multi-regularized functionals and -- in the presence of an\nunderlying differentiable structure -- to critical points. 
Finally, the main\nresult implies an unexpected consequence for functionals regularized with\nuniformly convex norms: excluding again at most countably many values of\n$\\alpha$, it turns out that for a minimizing sequence, convergence to a\nminimizer in the weak or strong sense is equivalent.\n","authors":["Massimo Fornasier","Jona Klemenc","Alessandro Scagliotti"],"pdf_url":"https://arxiv.org/pdf/2411.11639v3.pdf","comment":"16 pages, extension to multi-regularization and to critical points"},{"id":"http://arxiv.org/abs/2410.24222v2","updated":"2025-01-07T13:04:51Z","published":"2024-10-31T17:59:56Z","title":"Robust Gaussian Processes via Relevance Pursuit","summary":" Gaussian processes (GPs) are non-parametric probabilistic regression models\nthat are popular due to their flexibility, data efficiency, and well-calibrated\nuncertainty estimates. However, standard GP models assume homoskedastic\nGaussian noise, while many real-world applications are subject to non-Gaussian\ncorruptions. Variants of GPs that are more robust to alternative noise models\nhave been proposed, and entail significant trade-offs between accuracy and\nrobustness, and between computational requirements and theoretical guarantees.\nIn this work, we propose and study a GP model that achieves robustness against\nsparse outliers by inferring data-point-specific noise levels with a sequential\nselection procedure maximizing the log marginal likelihood that we refer to as\nrelevance pursuit. We show, surprisingly, that the model can be parameterized\nsuch that the associated log marginal likelihood is strongly concave in the\ndata-point-specific noise variances, a property rarely found in either robust\nregression objectives or GP marginal likelihoods. This in turn implies the weak\nsubmodularity of the corresponding subset selection problem, and thereby proves\napproximation guarantees for the proposed algorithm. 
We compare the model's\nperformance relative to other approaches on diverse regression and Bayesian\noptimization tasks, including the challenging but common setting of sparse\ncorruptions of the labels within or close to the function range.\n","authors":["Sebastian Ament","Elizabeth Santorella","David Eriksson","Ben Letham","Maximilian Balandat","Eytan Bakshy"],"pdf_url":"https://arxiv.org/pdf/2410.24222v2.pdf","comment":"NeurIPS 2024 Article (https://openreview.net/forum?id=5FATPIlWUJ)"},{"id":"http://arxiv.org/abs/2501.03744v1","updated":"2025-01-07T12:38:21Z","published":"2025-01-07T12:38:21Z","title":"Hydrogen Network Expansion Planning considering the Chicken-and-egg\n Dilemma and Market Uncertainty","summary":" Green hydrogen is thought to be a game changer for reaching\nsustainability targets. However, the transition to a green hydrogen economy\nfaces a critical challenge known as the `chicken-and-egg dilemma', wherein\nestablishing a hydrogen supply network relies on demand, while demand only\ngrows with reliable supply. In addition, as the hydrogen market is in the early\nstage, predicting demand distributions is challenging due to lack of data\navailability. This paper addresses these complex issues through a risk-averse\nframework with the introduction of a distributionally robust hydrogen network\nexpansion planning problem under decision-dependent demand ambiguity. The\nproblem optimizes location and production capacity decisions of the suppliers\nconsidering the moments of the stochastic hydrogen demand as a function of\nthese investment decisions. To obtain tractable representations of this\nproblem, we derive two different reformulations that consider continuous and\ndiscrete hydrogen demand support sets under different forms of decision\ndependencies. 
To efficiently solve the reformulations, we develop a tailored\nalgorithm based on the column-and-constraint generation approach, and enhance\nthe computational performance through solving the master problems to a relative\noptimality gap, decomposing the subproblems, and integrating pre-generated\ncolumns and constraints. To validate the effectiveness of our approach, we\ninvestigate a real case study leveraging data from the ``Hydrogen Energy\nApplications in Valley Environments for Northern Netherlands (HEAVENN)\"\nproject. The results reveal that considering the chicken-and-egg dilemma under\nuncertain hydrogen market conditions leads to earlier and more diverse\ninvestments, providing critical insights for policymakers based on the degree\nof decision dependency.\n","authors":["Sezen Ece Kayacık","Beste Basciftci","Albert H. Schrotenboer","Iris F. A. Vis","Evrim Ursavas"],"pdf_url":"https://arxiv.org/pdf/2501.03744v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16449v3","updated":"2025-01-07T12:16:43Z","published":"2024-05-26T06:33:11Z","title":"Reinforcement Learning for Jump-Diffusions, with Financial Applications","summary":" We study continuous-time reinforcement learning (RL) for stochastic control\nin which system dynamics are governed by jump-diffusion processes. We formulate\nan entropy-regularized exploratory control problem with stochastic policies to\ncapture the exploration--exploitation balance essential for RL. Unlike the pure\ndiffusion case initially studied by Wang et al. (2020), the derivation of the\nexploratory dynamics under jump-diffusions calls for a careful formulation of\nthe jump part. Through a theoretical analysis, we find that one can simply use\nthe same policy evaluation and $q$-learning algorithms in Jia and Zhou (2022a,\n2023), originally developed for controlled diffusions, without needing to check\na priori whether the underlying data come from a pure diffusion or a\njump-diffusion. 
However, we show that the presence of jumps ought to affect\nparameterizations of actors and critics in general. We investigate as an\napplication the mean--variance portfolio selection problem with stock price\nmodelled as a jump-diffusion, and show that both RL algorithms and\nparameterizations are invariant with respect to jumps. Finally, we present a\ndetailed study on applying the general theory to option hedging.\n","authors":["Xuefeng Gao","Lingfei Li","Xun Yu Zhou"],"pdf_url":"https://arxiv.org/pdf/2405.16449v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03718v1","updated":"2025-01-07T11:58:10Z","published":"2025-01-07T11:58:10Z","title":"Scalable Second-Order Optimization Algorithms for Minimizing Low-rank\n Functions","summary":" We present a random-subspace variant of cubic regularization algorithm that\nchooses the size of the subspace adaptively, based on the rank of the projected\nsecond derivative matrix. Iteratively, our variant only requires access to\n(small-dimensional) projections of first- and second-order problem derivatives\nand calculates a reduced step inexpensively. The ensuing method maintains the\noptimal global rate of convergence of (full-dimensional) cubic regularization,\nwhile showing improved scalability both theoretically and numerically,\nparticularly when applied to low-rank functions. 
When applied to the latter,\nour algorithm naturally adapts the subspace size to the true rank of the\nfunction, without knowing it a priori.\n","authors":["Edward Tansley","Coralia Cartis"],"pdf_url":"https://arxiv.org/pdf/2501.03718v1.pdf","comment":"Accepted at NeurIPS 2024 Workshop OPT2024: Optimization for Machine\n Learning"},{"id":"http://arxiv.org/abs/2501.03698v1","updated":"2025-01-07T11:03:11Z","published":"2025-01-07T11:03:11Z","title":"Computational complexity of sum-of-squares bounds for copositive\n programs","summary":" In recent years, copositive programming has received significant attention\nfor its ability to model hard problems in both discrete and continuous\noptimization. Several relaxations of copositive programs based on semidefinite\nprogramming (SDP) have been proposed in the literature, meant to provide\ntractable bounds. However, while these SDP-based relaxations are amenable to\nthe ellipsoid algorithm and interior point methods, it is not immediately\nobvious that they can be solved in polynomial time (even approximately). In\nthis paper, we consider the sum-of-squares (SOS) hierarchies of relaxations for\ncopositive programs introduced by Parrilo (2000), de Klerk & Pasechnik (2002)\nand Pe\\~na, Vera & Zuluaga (2006), which can be formulated as SDPs. We\nestablish sufficient conditions that guarantee the polynomial-time\ncomputability (up to fixed precision) of these relaxations. These conditions\nare satisfied by copositive programs that represent standard quadratic programs\nand their reciprocals. 
As an application, we show that the SOS bounds for the\n(weighted) stability number of a graph can be computed efficiently.\nAdditionally, we provide pathological examples of copositive programs (that do\nnot satisfy the sufficient conditions) whose SOS relaxations admit only\nfeasible solutions of doubly-exponential size.\n","authors":["Marilena Palomba","Lucas Slot","Luis Felipe Vargas","Monaldo Mastrolilli"],"pdf_url":"https://arxiv.org/pdf/2501.03698v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03691v1","updated":"2025-01-07T10:43:26Z","published":"2025-01-07T10:43:26Z","title":"Stabilization of Strictly Pre-Dissipative Receding Horizon Linear\n Quadratic Control by Terminal Costs","summary":" Asymptotic stability in receding horizon control is obtained under a strict\npre-dissipativity assumption, in the presence of suitable state constraints. In\nthis paper we analyze how terminal constraints can be replaced by suitable\nterminal costs. We restrict to the linear-quadratic setting as that allows us\nto obtain stronger results, while we analyze the full nonlinear case in a\nseparate contribution.\n","authors":["Mario Zanon","Lars Grüne"],"pdf_url":"https://arxiv.org/pdf/2501.03691v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03668v1","updated":"2025-01-07T10:12:01Z","published":"2025-01-07T10:12:01Z","title":"Controlling the low-temperature Ising model using spatiotemporal Markov\n decision theory","summary":" We introduce the spatiotemporal Markov decision process (STMDP), a special\ntype of Markov decision process that models sequential decision-making problems\nwhich are not only characterized by temporal, but also by spatial interaction\nstructures. To illustrate the framework, we construct an STMDP inspired by the\nlow-temperature two-dimensional Ising model on a finite, square lattice,\nevolving according to the Metropolis dynamics. 
We consider the situation in\nwhich an external decision maker aims to drive the system towards the all-plus\nconfiguration by flipping spins at specified moments in time. In order to\nanalyze this problem, we construct an auxiliary MDP by means of a reduction of\nthe configuration space to the local minima of the Hamiltonian. Leveraging the\nconvenient form of this auxiliary MDP, we uncover the structure of the optimal\npolicy by solving the Bellman equations in a recursive manner. Finally, we\nconduct a numerical study on the performance of the optimal policy obtained\nfrom the auxiliary MDP in the original Ising STMDP.\n","authors":["M. C. de Jongh","Richard J. Boucherie","M. N. M. van Lieshout"],"pdf_url":"https://arxiv.org/pdf/2501.03668v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23016v2","updated":"2025-01-07T09:39:51Z","published":"2024-10-30T13:46:07Z","title":"Regularity and stability for the Gibbs conditioning principle on path\n space via McKean-Vlasov control","summary":" We consider a system of diffusion processes interacting through their\nempirical distribution. Assuming that the empirical average of a given\nobservable can be observed at any time, we derive regularity and quantitative\nstability results for the optimal solutions in the associated version of the\nGibbs conditioning principle. The proofs rely on the analysis of a\nMcKean-Vlasov control problem with distributional constraints. 
Some new\nestimates are derived for Hamilton-Jacobi-Bellman equations and the Hessian of\nthe log-density of diffusion processes, which are of independent interest.\n","authors":["Louis-Pierre Chaintron","Giovanni Conforti"],"pdf_url":"https://arxiv.org/pdf/2410.23016v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00502v3","updated":"2025-01-07T08:50:35Z","published":"2024-06-01T17:10:56Z","title":"Non-geodesically-convex optimization in the Wasserstein space","summary":" We study a class of optimization problems in the Wasserstein space (the space\nof probability measures) where the objective function is nonconvex along\ngeneralized geodesics. Specifically, the objective exhibits some\ndifference-of-convex structure along these geodesics. The setting also\nencompasses sampling problems where the logarithm of the target distribution is\ndifference-of-convex. We derive multiple convergence insights for a novel semi\nForward-Backward Euler scheme under several nonconvex (and possibly nonsmooth)\nregimes. Notably, the semi Forward-Backward Euler is just a slight modification\nof the Forward-Backward Euler whose convergence is -- to our knowledge -- still\nunknown in our very general non-geodesically-convex setting.\n","authors":["Hoang Phuc Hau Luu","Hanlin Yu","Bernardo Williams","Petrus Mikkola","Marcelo Hartmann","Kai Puolamäki","Arto Klami"],"pdf_url":"https://arxiv.org/pdf/2406.00502v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.09498v2","updated":"2025-01-07T06:49:09Z","published":"2024-12-12T17:47:08Z","title":"Gradient descent inference in empirical risk minimization","summary":" Gradient descent is one of the most widely used iterative algorithms in\nmodern statistical learning. 
However, its precise algorithmic dynamics in\nhigh-dimensional settings remain only partially understood, which has therefore\nlimited its broader potential for statistical inference applications.\n This paper provides a precise, non-asymptotic distributional characterization\nof gradient descent iterates in a broad class of empirical risk minimization\nproblems, in the so-called mean-field regime where the sample size is\nproportional to the signal dimension. Our non-asymptotic state evolution theory\nholds for both general non-convex loss functions and non-Gaussian data, and\nreveals the central role of two Onsager correction matrices that precisely\ncharacterize the non-trivial dependence among all gradient descent iterates in\nthe mean-field regime.\n Although the Onsager correction matrices are typically analytically\nintractable, our state evolution theory facilitates a generic gradient descent\ninference algorithm that consistently estimates these matrices across a broad\nclass of models. Leveraging this algorithm, we show that the state evolution\ncan be inverted to construct (i) data-driven estimators for the generalization\nerror of gradient descent iterates and (ii) debiased gradient descent iterates\nfor inference of the unknown signal. 
Detailed applications to two canonical\nmodels--linear regression and (generalized) logistic regression--are worked out\nto illustrate model-specific features of our general theory and inference\nmethods.\n","authors":["Qiyang Han","Xiaocong Xu"],"pdf_url":"https://arxiv.org/pdf/2412.09498v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03552v1","updated":"2025-01-07T05:58:07Z","published":"2025-01-07T05:58:07Z","title":"Proxy Control Barrier Functions: Integrating Barrier-Based and\n Lyapunov-Based Safety-Critical Control Design","summary":" This work introduces a novel Proxy Control Barrier Function (PCBF) scheme\nthat integrates barrier-based and Lyapunov-based safety-critical control\nstrategies for strict-feedback systems with potentially unknown dynamics. The\nproposed method employs a modular design procedure, decomposing the original\nsystem into a proxy subsystem and a virtual tracking subsystem that are\ncontrolled by the control barrier function (CBF)-based and Lyapunov-based\ncontrollers, respectively. By integrating these separately designed\ncontrollers, the overall system's safety is ensured. Moreover, a new\nfilter-based disturbance observer is utilized to design a PCBF-based safe\ncontroller for strict-feedback systems subject to mismatched disturbances. This\napproach broadens the class of systems to which CBF-based methods can be\napplied and significantly simplifies CBF construction by requiring only the\nmodel of the proxy subsystem. 
The effectiveness of the proposed method is\ndemonstrated through numerical simulations.\n","authors":["Yujie Wang","Xiangru Xu"],"pdf_url":"https://arxiv.org/pdf/2501.03552v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03543v1","updated":"2025-01-07T05:37:59Z","published":"2025-01-07T05:37:59Z","title":"Distributionally Robust Joint Chance-Constrained Optimal Power Flow\n using Relative Entropy","summary":" Designing robust algorithms for the optimal power flow (OPF) problem is\ncritical for the control of large-scale power systems under uncertainty. The\nchance-constrained OPF (CCOPF) problem provides a natural formulation of the\ntrade-off between the operating cost and the constraint satisfaction rate. In\nthis work, we propose a new data-driven algorithm for the CCOPF problem, based\non distributionally robust optimization (DRO). We show that the\nproposed reformulation of the distributionally robust chance constraints is\nexact, whereas other approaches in the CCOPF literature rely on conservative\napproximations. We establish out-of-sample robustness guarantees for the\ndistributionally robust solution and prove that the solution is the most\nefficient among all approaches enjoying the same guarantees. We apply the\nproposed algorithm to the CCOPF problem and compare the performance of our\napproach with existing methods using simulations on IEEE benchmark power\nsystems.\n","authors":["Eli Brock","Haixiang Zhang","Javad Lavaei","Somayeh Sojoudi"],"pdf_url":"https://arxiv.org/pdf/2501.03543v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03542v1","updated":"2025-01-07T05:30:06Z","published":"2025-01-07T05:30:06Z","title":"Turbulence modeling over riblets via domain transformation","summary":" Numerical and experimental studies have demonstrated the drag-reducing\npotential of carefully designed streamwise-elongated riblets in lowering\nskin-friction drag. 
To support the systematic design of such surface\ncorrugations, recent efforts have integrated simplified versions of the\ngoverning equations with innovative methods for representing the effects of\nrough boundaries on flow dynamics. Notably, the statistical response of the\neddy-viscosity-enhanced linearized Navier-Stokes equations has been shown to\neffectively capture the ability of riblets in suppressing turbulence, quantify\nthe influence of background turbulence on the mean velocity, and reproduce\nestablished drag-reduction trends. In this paper, we enhance the flexibility\nand computational efficiency of this simulation-free approach by implementing a\ndomain transformation for surface representation, along with a perturbation\nanalysis on a small geometric parameter of the riblets. While domain\ntransformation complicates the differential equations, it provides accurate\nboundary representations and facilitates the analysis of complex riblet shapes\nat high Reynolds numbers by enabling perturbation analysis to simplify the\ndimensional complexity of the governing equations. Our method successfully\npredicts drag reduction trends for semi-circular riblets, consistent with\nexisting literature. We further utilize our framework to investigate flow\nmechanisms influenced by riblets and extend our study to channel flows with\nfriction Reynolds numbers up to 2003. 
Our findings reveal the emergence of\nKelvin-Helmholtz rollers over large and sharp semi-circular riblets,\ncontributing to the degradation of drag reduction in these geometries.\nAdditionally, we examine the impact of riblets on near-wall flow structures,\nfocusing on their suppression of streamwise-elongated structures in flows over\nlarge riblets.\n","authors":["Mohammadamin Naseri","Armin Zare"],"pdf_url":"https://arxiv.org/pdf/2501.03542v1.pdf","comment":"40 pages, 26 figures"},{"id":"http://arxiv.org/abs/2411.13805v3","updated":"2025-01-07T03:08:57Z","published":"2024-11-21T03:09:18Z","title":"On Representing Convex Quadratically Constrained Quadratic Programs via\n Graph Neural Networks","summary":" Convex quadratically constrained quadratic programs (QCQPs) involve finding a\nsolution within a convex feasible region defined by quadratic constraints while\nminimizing a convex quadratic objective function. These problems arise in\nvarious industrial applications, including power systems and signal processing.\nTraditional methods for solving convex QCQPs primarily rely on matrix\nfactorization, which quickly becomes computationally prohibitive as the problem\nsize increases. Recently, graph neural networks (GNNs) have gained attention\nfor their potential in representing and solving various optimization problems\nsuch as linear programs and linearly constrained quadratic programs. In this\nwork, we investigate the representation power of GNNs in the context of QCQP\ntasks. Specifically, we propose a new tripartite graph representation for\ngeneral convex QCQPs and properly associate it with message-passing GNNs. We\ndemonstrate that there exist GNNs capable of reliably representing key\nproperties of convex QCQPs, including feasibility, optimal value, and optimal\nsolution. 
Our result deepens the understanding of the connection between QCQPs\nand GNNs, paving the way for future machine learning approaches to efficiently\nsolve QCQPs.\n","authors":["Chenyang Wu","Qian Chen","Akang Wang","Tian Ding","Ruoyu Sun","Wenguo Yang","Qingjiang Shi"],"pdf_url":"https://arxiv.org/pdf/2411.13805v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.19678v3","updated":"2025-01-07T03:01:22Z","published":"2024-09-29T12:10:35Z","title":"SymILO: A Symmetry-Aware Learning Framework for Integer Linear\n Optimization","summary":" Integer linear programs (ILPs) are commonly employed to model diverse\npractical problems such as scheduling and planning. Recently, machine learning\ntechniques have been utilized to solve ILPs. A straightforward idea is to train\na model via supervised learning, with an ILP as the input and an optimal\nsolution as the label. An ILP is symmetric if its variables can be permuted\nwithout changing the problem structure, resulting in numerous equivalent and\noptimal solutions. Randomly selecting an optimal solution as the label can\nintroduce variability in the training data, which may hinder the model from\nlearning stable patterns. In this work, we incorporate the intrinsic symmetry\nof ILPs and propose a novel training framework called SymILO. Specifically, we\nmodify the learning task by introducing solution permutation along with neural\nnetwork weights as learnable parameters and then design an alternating\nalgorithm to jointly optimize the loss function. 
We conduct extensive\nexperiments on ILPs involving different symmetries and the computational\nresults demonstrate that our symmetry-aware approach significantly outperforms\nthree existing methods -- achieving $50.3\\%$, $66.5\\%$, and $45.4\\%$ average\nimprovements, respectively.\n","authors":["Qian Chen","Tianjian Zhang","Linxin Yang","Qingyu Han","Akang Wang","Ruoyu Sun","Xiaodong Luo","Tsung-Hui Chang"],"pdf_url":"https://arxiv.org/pdf/2409.19678v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.13865v2","updated":"2025-01-07T02:50:02Z","published":"2024-02-21T15:13:43Z","title":"Variable Projection Algorithms: Theoretical Insights and A Novel\n Approach for Problems with Large Residual","summary":" This paper delves into an in-depth exploration of the Variable Projection\n(VP) algorithm, a powerful tool for solving separable nonlinear optimization\nproblems across multiple domains, including system identification, image\nprocessing, and machine learning. We first establish a theoretical framework to\nexamine the effect of the approximate treatment of the coupling relationship\namong parameters on the local convergence of the VP algorithm and theoretically\nprove that the Kaufman's VP algorithm can achieve a similar convergence rate as\nthe Golub \\& Pereyra's form. These studies fill the gap in the existing\nconvergence theory analysis, and provide a solid foundation for understanding\nthe mechanism of VP algorithm and broadening its application horizons.\nFurthermore, drawing inspiration from these theoretical revelations, we design\na refined VP algorithm for handling separable nonlinear optimization problems\ncharacterized by large residual, called VPLR, which boosts the convergence\nperformance by addressing the interdependence of parameters within the\nseparable model and by continually correcting the approximated Hessian matrix\nto counteract the influence of large residual during the iterative process. 
The\neffectiveness of this refined algorithm is corroborated through numerical\nexperimentation.\n","authors":["Guangyong Chen","Peng Xue","Min Gan","Jing Chen","Wenzhong Guo","C. L. Philip. Chen"],"pdf_url":"https://arxiv.org/pdf/2402.13865v2.pdf","comment":"18 pages, 8 figures"},{"id":"http://arxiv.org/abs/2402.07108v3","updated":"2025-01-07T02:14:56Z","published":"2024-02-11T05:35:50Z","title":"Decoupling Learning and Decision-Making: Breaking the\n $\\mathcal{O}(\\sqrt{T})$ Barrier in Online Resource Allocation with\n First-Order Methods","summary":" Online linear programming plays an important role in both revenue management\nand resource allocation, and recent research has focused on developing\nefficient first-order online learning algorithms. Despite the empirical success\nof first-order methods, they typically achieve a regret no better than\n$\\mathcal{O}(\\sqrt{T})$, which is suboptimal compared to the $\\mathcal{O}(\\log\nT)$ bound guaranteed by the state-of-the-art linear programming (LP)-based\nonline algorithms. This paper establishes several important facts about online\nlinear programming, which unveils the challenge for first-order-method-based\nonline algorithms to achieve beyond $\\mathcal{O}(\\sqrt{T})$ regret. To address\nthe challenge, we introduce a new algorithmic framework that decouples learning\nfrom decision-making. 
For the first time, we show that first-order methods can\nattain regret $\\mathcal{O}(T^{1/3})$ with this new framework.\n","authors":["Wenzhi Gao","Chunlin Sun","Chenyu Xue","Dongdong Ge","Yinyu Ye"],"pdf_url":"https://arxiv.org/pdf/2402.07108v3.pdf","comment":"Merged into arXiv:2501.02761"},{"id":"http://arxiv.org/abs/2501.03470v1","updated":"2025-01-07T02:12:00Z","published":"2025-01-07T02:12:00Z","title":"Positivstellensätze for polynomial matrices with universal quantifiers","summary":" This paper studies Positivstellens\\\"atze for a polynomial matrix subject to\npolynomial matrix inequality constraints with universal quantifiers. We first\npresent a Scherer-Hol-type Positivstellensatz under the Archimedean condition.\nWhen the objective is a scalar polynomial, we further provide a sparse\nScherer-Hol-type Positivstellensatz in the presence of correlative sparsity.\nNext, without assuming the Archimedean condition, we derive\nPutinar-Vasilescu-type, P\\'olya-type, and Lasserre-Netzer-type\nPositivstellens\\\"atze under the same setting. These results can be viewed as\ncommon generalizations of corresponding Positivstellens\\\"atze in the cases of\npolynomials, polynomials with universal quantifiers, and polynomial matrices.\nFor the proofs, techniques from *-algebra, real algebraic geometry, operator\ntheory, and convex optimization are employed. 
Applications of the established\nPositivstellens\\\"atze to robust polynomial matrix optimization are also\ndiscussed.\n","authors":["Feng Guo","Jie Wang"],"pdf_url":"https://arxiv.org/pdf/2501.03470v1.pdf","comment":"31 pages, 2 tables"},{"id":"http://arxiv.org/abs/2406.00612v3","updated":"2025-01-07T02:11:06Z","published":"2024-06-02T04:02:40Z","title":"Policy Iteration for Exploratory Hamilton--Jacobi--Bellman Equations","summary":" We study the policy iteration algorithm (PIA) for entropy-regularized\nstochastic control problems on an infinite time horizon with a large discount\nrate, focusing on two main scenarios. First, we analyze PIA with bounded\ncoefficients where the controls applied to the diffusion term satisfy a\nsmallness condition. We demonstrate the convergence of PIA based on a uniform\n$\\mathcal{C}^{2,\\alpha}$ estimate for the value sequence generated by PIA, and\nprovide a quantitative convergence analysis for this scenario. Second, we\ninvestigate PIA with unbounded coefficients but no control over the diffusion\nterm. In this scenario, we first provide the well-posedness of the exploratory\nHamilton--Jacobi--Bellman equation with linear growth coefficients and\npolynomial growth reward function. By such a well-posedness result we achieve\nPIA's convergence by establishing quantitative locally uniform\n$\\mathcal{C}^{1,\\alpha}$ estimates for the generated value sequence.\n","authors":["Hung Vinh Tran","Zhenhua Wang","Yuming Paul Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.00612v3.pdf","comment":"21 pages"},{"id":"http://arxiv.org/abs/2501.03459v1","updated":"2025-01-07T01:21:07Z","published":"2025-01-07T01:21:07Z","title":"Convergence of a particle method for gradient flows on the\n $L^p$-Wasserstein space","summary":" We study the particle method to approximate the gradient flow on the\n$L^p$-Wasserstein space. 
This method relies on the discretization of the energy\nintroduced by [3] via nonoverlapping balls centered at the particles and\npreserves the gradient flow structure at the particle level. We prove the\nconvergence of the discrete gradient flow to the continuum gradient flow on the\n$L^p$-Wasserstein space over $\\mathbb R$, specifically to the doubly nonlinear\ndiffusion equation in one dimension.\n","authors":["Rong Lei"],"pdf_url":"https://arxiv.org/pdf/2501.03459v1.pdf","comment":"arXiv admin note: text overlap with arXiv:1605.08086 by other authors"},{"id":"http://arxiv.org/abs/2501.03443v1","updated":"2025-01-07T00:09:52Z","published":"2025-01-07T00:09:52Z","title":"Optimization Learning","summary":" This article introduces the concept of optimization learning, a methodology\nto design optimization proxies that learn the input/output mapping of\nparametric optimization problems. These optimization proxies are trustworthy by\ndesign: they compute feasible solutions to the underlying optimization\nproblems, provide quality guarantees on the returned solutions, and scale to\nlarge instances. Optimization proxies are differentiable programs that combine\ntraditional deep learning technology with repair or completion layers to\nproduce feasible solutions. The article shows that optimization proxies can be\ntrained end-to-end in a self-supervised way. It presents methodologies to\nprovide performance guarantees and to scale optimization proxies to large-scale\noptimization problems. 
The potential of optimization proxies is highlighted\nthrough applications in power systems and, in particular, real-time risk\nassessment and security-constrained optimal power flow.\n","authors":["Pascal Van Hentenryck"],"pdf_url":"https://arxiv.org/pdf/2501.03443v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04165v1","updated":"2025-01-07T22:25:40Z","published":"2025-01-07T22:25:40Z","title":"Unifying restart accelerated gradient and proximal bundle methods","summary":" This paper presents a novel restarted version of Nesterov's accelerated\ngradient method and establishes its optimal iteration-complexity for solving\nconvex smooth composite optimization problems. The proposed restart accelerated\ngradient method is shown to be a specific instance of the accelerated inexact\nproximal point framework introduced in \"An accelerated hybrid proximal\nextragradient method for convex optimization and its implications to\nsecond-order methods\" by Monteiro and Svaiter, SIAM Journal on Optimization,\n2013. Furthermore, this work examines the proximal bundle method within the\ninexact proximal point framework, demonstrating that it is an instance of the\nframework. Notably, this paper provides new insights into the underlying\nalgorithmic principle that unifies two seemingly disparate optimization\nmethods, namely, the restart accelerated gradient and the proximal bundle\nmethods.\n","authors":["Jiaming Liang"],"pdf_url":"https://arxiv.org/pdf/2501.04165v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2501.04160v1","updated":"2025-01-07T22:19:06Z","published":"2025-01-07T22:19:06Z","title":"Collaborative Spacecraft Servicing under Partial Feedback using\n Lyapunov-based Deep Neural Networks","summary":" Multi-agent systems are increasingly applied in space missions, including\ndistributed space systems, resilient constellations, and autonomous rendezvous\nand docking operations. 
A critical emerging application is collaborative\nspacecraft servicing, which encompasses on-orbit maintenance, space debris\nremoval, and swarm-based satellite repositioning. These missions involve\nservicing spacecraft interacting with malfunctioning or defunct spacecraft\nunder challenging conditions, such as limited state information, measurement\ninaccuracies, and erratic target behaviors. Existing approaches often rely on\nassumptions of full state knowledge or single-integrator dynamics, which are\nimpractical for real-world applications involving second-order spacecraft\ndynamics. This work addresses these challenges by developing a distributed\nstate estimation and tracking framework that requires only relative position\nmeasurements and operates under partial state information. A novel\n$\\rho$-filter is introduced to reconstruct unknown states using locally\navailable information, and a Lyapunov-based deep neural network adaptive\ncontroller is developed that adaptively compensates for uncertainties stemming\nfrom unknown spacecraft dynamics. To ensure the collaborative spacecraft\nregulation problem is well-posed, a trackability condition is defined. A\nLyapunov-based stability analysis is provided to ensure exponential convergence\nof errors in state estimation and spacecraft regulation to a neighborhood of\nthe origin under the trackability condition. The developed method eliminates\nthe need for expensive velocity sensors or extensive pre-training, offering a\npractical and robust solution for spacecraft servicing in complex, dynamic\nenvironments.\n","authors":["Cristian F. Nino","Omkar Sudhir Patil","Christopher D. Petersen","Sean Phillips","Warren E. 
Dixon"],"pdf_url":"https://arxiv.org/pdf/2501.04160v1.pdf","comment":"24 pages, 4 Figures, Journal"},{"id":"http://arxiv.org/abs/2501.04151v1","updated":"2025-01-07T21:37:41Z","published":"2025-01-07T21:37:41Z","title":"Efficient LP warmstarting for linear modifications of the constraint\n matrix","summary":" We consider the problem of computing the optimal solution and objective of a\nlinear program under linearly changing linear constraints. More specifically,\nwe want to compute the optimal solution of a linear optimization where the\nconstraint matrix linearly depends on a parameter that can take p different\nvalues. Based on the information given by a precomputed basis, we present three\nefficient LP warm-starting algorithms. Each algorithm is either based on the\neigenvalue decomposition, the Schur decomposition, or a tweaked eigenvalue\ndecomposition to evaluate the optimal solution and optimal objective of these\nproblems. The three algorithms have an overall complexity O(m^3 + pm^2) where m\nis the number of constraints of the original problem and p the number of values\nof the parameter that we want to evaluate. We also provide theorems related to\nthe optimality conditions to verify when a basis is still optimal and a local\nbound on the objective.\n","authors":["Guillaume Derval","Bardhyl Miftari","Damien Ernst","Quentin Louveaux"],"pdf_url":"https://arxiv.org/pdf/2501.04151v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04143v1","updated":"2025-01-07T21:12:08Z","published":"2025-01-07T21:12:08Z","title":"Linear Optimization for the Perfect Meal: A Data-Driven Approach to\n Optimising the Perfect Meal Using Gurobi","summary":" This study aims to optimize meal planning for nutritional health and cost\nefficiency using linear programming. Linear optimization provides an effective\nframework for addressing the problem of an optimal diet, as the composition of\nfood can be naturally modeled as a linearly additive system. 
Leveraging a\ncomprehensive nutrition dataset, our model minimizes meal costs while meeting\nspecific nutritional requirements. We explore additional complexities, such as\nfractional weights and nutrient ratio constraints, enhancing the robustness of\nthe solution. Case studies address common nutritional challenges, providing\ntailored diet plans. The significance lies in aiding individuals to form\nbalanced, cost-effective dietary schedules, considering fitness goals and\ncaloric needs. This research contributes to efficient, sustainable, and\ntime-sensitive meal planning, emphasizing the intersection of nutrition,\noptimization, and real-world applicability.\n","authors":["Utkarsh Prajapati","Tanushree Jain","Abhishek Machiraju","Divyam Kaushik"],"pdf_url":"https://arxiv.org/pdf/2501.04143v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04134v1","updated":"2025-01-07T20:46:59Z","published":"2025-01-07T20:46:59Z","title":"Mixing Times and Privacy Analysis for the Projected Langevin Algorithm\n under a Modulus of Continuity","summary":" We study the mixing time of the projected Langevin algorithm (LA) and the\nprivacy curve of noisy Stochastic Gradient Descent (SGD), beyond nonexpansive\niterations. Specifically, we derive new mixing time bounds for the projected LA\nwhich are, in some important cases, dimension-free and poly-logarithmic on the\naccuracy, closely matching the existing results in the smooth convex case.\nAdditionally, we establish new upper bounds for the privacy curve of the\nsubsampled noisy SGD algorithm. These bounds show a crucial dependency on the\nregularity of gradients, and are useful for a wide range of convex losses\nbeyond the smooth case. Our analysis relies on a suitable extension of the\nPrivacy Amplification by Iteration (PABI) framework (Feldman et al., 2018;\nAltschuler and Talwar, 2022, 2023) to noisy iterations whose gradient map is\nnot necessarily nonexpansive. 
This extension is achieved by designing an\noptimization problem which accounts for the best possible R\\'enyi divergence\nbound obtained by an application of PABI, where the tractability of the problem\nis crucially related to the modulus of continuity of the associated gradient\nmapping. We show that, in several interesting cases -- including the nonsmooth\nconvex, weakly smooth and (strongly) dissipative -- such optimization problem\ncan be solved exactly and explicitly. This yields the tightest possible\nPABI-based bounds, where our results are either new or substantially sharper\nthan those in previous works.\n","authors":["Mario Bravo","Juan P. Flores-Mella","Cristóbal Guzmán"],"pdf_url":"https://arxiv.org/pdf/2501.04134v1.pdf","comment":"40 pages, 2 figures"},{"id":"http://arxiv.org/abs/2401.14554v2","updated":"2025-01-07T20:44:10Z","published":"2024-01-25T22:49:13Z","title":"GCBF+: A Neural Graph Control Barrier Function Framework for Distributed\n Safe Multi-Agent Control","summary":" Distributed, scalable, and safe control of large-scale multi-agent systems is\na challenging problem. In this paper, we design a distributed framework for\nsafe multi-agent control in large-scale environments with obstacles, where a\nlarge number of agents are required to maintain safety using only local\ninformation and reach their goal locations. We introduce a new class of\ncertificates, termed graph control barrier function (GCBF), which are based on\nthe well-established control barrier function theory for safety guarantees and\nutilize a graph structure for scalable and generalizable distributed control of\nMAS. We develop a novel theoretical framework to prove the safety of an\narbitrary-sized MAS with a single GCBF. We propose a new training framework\nGCBF+ that uses graph neural networks to parameterize a candidate GCBF and a\ndistributed control policy. 
The proposed framework is distributed and is\ncapable of taking point clouds from LiDAR, instead of actual state information,\nfor real-world robotic applications. We illustrate the efficacy of the proposed\nmethod through various hardware experiments on a swarm of drones with\nobjectives ranging from exchanging positions to docking on a moving target\nwithout collision. Additionally, we perform extensive numerical experiments,\nwhere the number and density of agents, as well as the number of obstacles,\nincrease. Empirical results show that in complex environments with agents with\nnonlinear dynamics (e.g., Crazyflie drones), GCBF+ outperforms the hand-crafted\nCBF-based method with the best performance by up to 20% for relatively\nsmall-scale MAS with up to 256 agents, and leading reinforcement learning (RL)\nmethods by up to 40% for MAS with 1024 agents. Furthermore, the proposed method\ndoes not compromise on the performance, in terms of goal reaching, for\nachieving high safety rates, which is a common trade-off in RL-based methods.\n","authors":["Songyuan Zhang","Oswin So","Kunal Garg","Chuchu Fan"],"pdf_url":"https://arxiv.org/pdf/2401.14554v2.pdf","comment":"20 pages, 15 figures; Accepted by IEEE Transactions on Robotics\n (T-RO)"},{"id":"http://arxiv.org/abs/2402.16623v2","updated":"2025-01-07T20:06:34Z","published":"2024-02-26T14:53:39Z","title":"Generalized sparsity-promoting solvers for Bayesian inverse problems:\n Versatile sparsifying transforms and unknown noise variances","summary":" Bayesian hierarchical models can provide efficient algorithms for finding\nsparse solutions to ill-posed inverse problems. The models typically comprise a\nconditionally Gaussian prior model for the unknown which is augmented by a\ngeneralized gamma hyper-prior model for variance hyper-parameters. 
This\ninvestigation generalizes these models and their efficient maximum a posteriori\n(MAP) estimation using the iterative alternating sequential (IAS) algorithm in\ntwo ways: (1) General sparsifying transforms: Diverging from conventional\nmethods, our approach permits the use of sparsifying transformations with\nnontrivial kernels; (2) Unknown noise variances: We treat the noise variance as\na random variable that is estimated during the inference procedure. This is\nimportant in applications where the noise variance cannot be accurately\nestimated a priori. Remarkably, these augmentations neither significantly\nburden the computational expense of the algorithm nor compromise its efficacy.\nWe include convexity and convergence analysis for the method and demonstrate\nits efficacy in several numerical experiments.\n","authors":["Jonathan Lindbloom","Jan Glaubitz","Anne Gelb"],"pdf_url":"https://arxiv.org/pdf/2402.16623v2.pdf","comment":"27 pages, 6 figures"},{"id":"http://arxiv.org/abs/2405.10735v2","updated":"2025-01-07T19:59:26Z","published":"2024-05-17T12:29:48Z","title":"Variance-reduction for Variational Inequality Problems with Bregman\n Distance Function","summary":" In this paper, we address variational inequalities (VI) with a finite-sum\nstructure. We introduce a novel single-loop stochastic variance-reduced\nalgorithm, incorporating the Bregman distance function, and establish an\noptimal convergence guarantee under a monotone setting. Additionally, we\nexplore a structured class of non-monotone problems that exhibit weak Minty\nsolutions, and analyze the complexity of our proposed method, highlighting a\nsignificant improvement over existing approaches. 
Numerical experiments are\npresented to demonstrate the performance of our algorithm compared to\nstate-of-the-art methods\n","authors":["Zeinab Alizadeh","Erfan Yazdandoost Hamedani","Afrooz Jalilzadeh"],"pdf_url":"https://arxiv.org/pdf/2405.10735v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04105v1","updated":"2025-01-07T19:29:10Z","published":"2025-01-07T19:29:10Z","title":"DeepVIVONet: Using deep neural operators to optimize sensor locations\n with application to vortex-induced vibrations","summary":" We introduce DeepVIVONet, a new framework for optimal dynamic reconstruction\nand forecasting of the vortex-induced vibrations (VIV) of a marine riser, using\nfield data. We demonstrate the effectiveness of DeepVIVONet in accurately\nreconstructing the motion of an off--shore marine riser by using sparse\nspatio-temporal measurements. We also show the generalization of our model in\nextrapolating to other flow conditions via transfer learning, underscoring its\npotential to streamline operational efficiency and enhance predictive accuracy.\nThe trained DeepVIVONet serves as a fast and accurate surrogate model for the\nmarine riser, which we use in an outer--loop optimization algorithm to obtain\nthe optimal locations for placing the sensors. Furthermore, we employ an\nexisting sensor placement method based on proper orthogonal decomposition (POD)\nto compare with our data-driven approach. 
We find that while POD offers a\ngood approach for initial sensor placement, DeepVIVONet's adaptive capabilities\nyield more precise and cost-effective configurations.\n","authors":["Ruyin Wan","Ehsan Kharazmi","Michael S Triantafyllou","George Em Karniadakis"],"pdf_url":"https://arxiv.org/pdf/2501.04105v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.08637v2","updated":"2025-01-07T19:26:17Z","published":"2024-06-12T20:50:26Z","title":"A Game Between Two Identical Dubins Cars: Evading a Conic Sensor in\n Minimum Time","summary":" A fundamental task in mobile robotics is keeping an intelligent agent under\nsurveillance with an autonomous robot as it travels in the environment. This\nwork studies a theoretical version of that problem involving one of the most\npopular vehicle platforms in robotics. In particular, we consider two identical\nDubins cars moving on a plane without obstacles. One of them plays as the\npursuer, and it is equipped with a limited field-of-view detection region\nmodeled as a semi-infinite cone with its apex at the pursuer's position. The\npursuer aims to maintain the other Dubins car, which plays as the evader, as\nmuch time as possible inside its detection region. On the contrary, the evader\nwants to escape as soon as possible. In this work, employing differential game\ntheory, we find the time-optimal motion strategies near the game's end. The\nanalysis of those trajectories reveals the existence of at least two singular\nsurfaces: a Transition Surface (also known as a Switch Surface) and an Evader's\nUniversal Surface. 
We also found that the barrier's standard construction\nproduces a surface that partially lies outside the playing space.\n","authors":["Ubaldo Ruiz"],"pdf_url":"https://arxiv.org/pdf/2406.08637v2.pdf","comment":"35 pages, 16 figures"},{"id":"http://arxiv.org/abs/2501.02098v2","updated":"2025-01-07T14:20:44Z","published":"2025-01-03T20:51:07Z","title":"Graph-Based Modeling and Decomposition of Hierarchical Optimization\n Problems","summary":" We present a graph-theoretic modeling approach for hierarchical optimization\nthat leverages the OptiGraph abstraction implemented in the Julia package\nPlasmo$.$jl. We show that the abstraction is flexible and can effectively\ncapture complex hierarchical connectivity that arises from decision-making over\nmultiple spatial and temporal scales (e.g., integration of planning,\nscheduling, and operations in manufacturing and infrastructures). We also show\nthat the graph abstraction facilitates the conceptualization and implementation\nof decomposition and approximation schemes. Specifically, we propose a\ngraph-based Benders decomposition (gBD) framework that enables the exploitation\nof hierarchical (nested) structures and that uses graph\naggregation/partitioning procedures to discover such structures. In addition,\nwe provide a Julia implementation of gBD, which we call PlasmoBenders$.$jl. We\nillustrate the capabilities using examples arising in the context of energy and\npower systems.\n","authors":["David L. Cole","Filippo Pecci","Omar J. Guerra","Harsha Gangammanavar","Jesse D. Jenkins","Victor M. 
Zavala"],"pdf_url":"https://arxiv.org/pdf/2501.02098v2.pdf","comment":"66 pages, 3 tables, 28 figures, updated abstract"},{"id":"http://arxiv.org/abs/2112.02215v3","updated":"2025-01-07T20:32:52Z","published":"2021-12-04T01:40:34Z","title":"Deep Policy Iteration with Integer Programming for Inventory Management","summary":" We present a Reinforcement Learning (RL) based framework for optimizing\nlong-term discounted reward problems with large combinatorial action space and\nstate dependent constraints. These characteristics are common to many\noperations management problems, e.g., network inventory replenishment, where\nmanagers have to deal with uncertain demand, lost sales, and capacity\nconstraints that result in more complex feasible action spaces. Our proposed\nProgrammable Actor Reinforcement Learning (PARL) uses a deep-policy iteration\nmethod that leverages neural networks (NNs) to approximate the value function\nand combines it with mathematical programming (MP) and sample average\napproximation (SAA) to solve the per-step-action optimally while accounting for\ncombinatorial action spaces and state-dependent constraint sets. We show how\nthe proposed methodology can be applied to complex inventory replenishment\nproblems where analytical solutions are intractable. We also benchmark the\nproposed algorithm against state-of-the-art RL algorithms and commonly used\nreplenishment heuristics and find it considerably outperforms existing methods\nby as much as 14.7% on average in various complex supply chain settings. We\nfind that this improvement of PARL over benchmark algorithms can be directly\nattributed to better inventory cost management, especially in inventory\nconstrained settings. Furthermore, in the simpler setting where optimal\nreplenishment policy is tractable or known near optimal heuristics exist, we\nfind that the RL approaches can learn near optimal policies. 
Finally, to make\nRL algorithms more accessible for inventory management researchers, we also\ndiscuss the development of a modular Python library that can be used to test\nthe performance of RL algorithms with various supply chain structures and spur\nfuture research in developing practical and near-optimal algorithms for\ninventory management problems.\n","authors":["Pavithra Harsha","Ashish Jagmohan","Jayant Kalagnanam","Brian Quanz","Divya Singhvi"],"pdf_url":"https://arxiv.org/pdf/2112.02215v3.pdf","comment":"Prior shorter version accepted to NeurIPS 2021 Deep RL Workshop.\n Updated version to appear in MSOM journal. Authors are listed in alphabetical\n order"},{"id":"http://arxiv.org/abs/2501.05481v1","updated":"2025-01-07T19:43:35Z","published":"2025-01-07T19:43:35Z","title":"Blackwell Equilibrium in Repeated Games","summary":" We apply Blackwell optimality to repeated games. An equilibrium whose\nstrategy profile is sequentially rational for all high enough discount factors\nsimultaneously is a Blackwell (subgame-perfect, perfect public, etc.)\nequilibrium. The bite of this requirement depends on the monitoring structure.\nUnder perfect monitoring, a ``folk'' theorem holds relative to an appropriate\nnotion of minmax. Under imperfect public monitoring, absent a public\nrandomization device, any perfect public equilibrium generically involves pure\naction profiles or stage-game Nash equilibria only. 
Under private conditionally\nindependent monitoring, in a class of games that includes the prisoner's\ndilemma, the stage-game Nash equilibrium is played in every round.\n","authors":["Costas Cavounidis","Sambuddha Ghosh","Johannes Hörner","Eilon Solan","Satoru Takahashi"],"pdf_url":"https://arxiv.org/pdf/2501.05481v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2111.14104v2","updated":"2025-01-07T03:02:38Z","published":"2021-11-28T11:01:48Z","title":"Optimal Partition for Multi-Type Queueing System","summary":" We study an optimal server partition and customer assignment problem for an\nuncapacitated FCFS queueing system with heterogeneous types of customers. Each\ntype of customers is associated with a Poisson arrival, a certain service time\ndistribution, and a unit waiting cost. The goal is to minimize the expected\ntotal waiting cost by partitioning the server into sub-queues, each with a\nsmaller service capacity, and routing customer types probabilistically. First,\nwe show that by properly partitioning the queue, it is possible to reduce the\nexpected waiting costs by an arbitrarily large ratio. Then, we show that for\nany given server partition, the optimal customer assignment admits a certain\ngeometric structure, enabling an efficient algorithm to find the optimal\nassignment. Such an optimal structure also applies when minimizing the expected\nsojourn time. Finally, we consider the joint partition-assignment optimization\nproblem. The customer assignment under the optimal server partition admits a\nstronger structure. Specifically, if the first two moments of the service time\ndistributions satisfy certain properties, it is optimal to deterministically\nassign customer types with consecutive service rates to the same sub-queue.\nThis structure allows for more efficient algorithms. 
Overall, the common rule\nof thumb to partition customers into continuous segments ranked by service\nrates could be suboptimal, and our work is the first to comprehensively study\nthe queue partition problem based on customer types.\n","authors":["Shengyu Cao","Simai He","Zizhuo Wang","Yifan Feng"],"pdf_url":"https://arxiv.org/pdf/2111.14104v2.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2501.04005v1","updated":"2025-01-07T18:59:59Z","published":"2025-01-07T18:59:59Z","title":"LargeAD: Large-Scale Cross-Sensor Data Pretraining for Autonomous\n Driving","summary":" Recent advancements in vision foundation models (VFMs) have revolutionized\nvisual perception in 2D, yet their potential for 3D scene understanding,\nparticularly in autonomous driving applications, remains underexplored. In this\npaper, we introduce LargeAD, a versatile and scalable framework designed for\nlarge-scale 3D pretraining across diverse real-world driving datasets. Our\nframework leverages VFMs to extract semantically rich superpixels from 2D\nimages, which are aligned with LiDAR point clouds to generate high-quality\ncontrastive samples. This alignment facilitates cross-modal representation\nlearning, enhancing the semantic consistency between 2D and 3D data. We\nintroduce several key innovations: i) VFM-driven superpixel generation for\ndetailed semantic representation, ii) a VFM-assisted contrastive learning\nstrategy to align multimodal features, iii) superpoint temporal consistency to\nmaintain stable representations across time, and iv) multi-source data\npretraining to generalize across various LiDAR configurations. Our approach\ndelivers significant performance improvements over state-of-the-art methods in\nboth linear probing and fine-tuning tasks for both LiDAR-based segmentation and\nobject detection. 
Extensive experiments on eleven large-scale multi-modal\ndatasets highlight our superior performance, demonstrating the adaptability,\nefficiency, and robustness in real-world autonomous driving scenarios.\n","authors":["Lingdong Kong","Xiang Xu","Youquan Liu","Jun Cen","Runnan Chen","Wenwei Zhang","Liang Pan","Kai Chen","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2501.04005v1.pdf","comment":"Preprint; 16 pages, 7 figures, 8 tables; Project Page at\n https://ldkong.com/LargeAD"},{"id":"http://arxiv.org/abs/2501.04004v1","updated":"2025-01-07T18:59:58Z","published":"2025-01-07T18:59:58Z","title":"LiMoE: Mixture of LiDAR Representation Learners from Automotive Scenes","summary":" LiDAR data pretraining offers a promising approach to leveraging large-scale,\nreadily available datasets for enhanced data utilization. However, existing\nmethods predominantly focus on sparse voxel representation, overlooking the\ncomplementary attributes provided by other LiDAR representations. In this work,\nwe propose LiMoE, a framework that integrates the Mixture of Experts (MoE)\nparadigm into LiDAR data representation learning to synergistically combine\nmultiple representations, such as range images, sparse voxels, and raw points.\nOur approach consists of three stages: i) Image-to-LiDAR Pretraining, which\ntransfers prior knowledge from images to point clouds across different\nrepresentations; ii) Contrastive Mixture Learning (CML), which uses MoE to\nadaptively activate relevant attributes from each representation and distills\nthese mixed features into a unified 3D network; iii) Semantic Mixture\nSupervision (SMS), which combines semantic logits from multiple representations\nto boost downstream segmentation performance. Extensive experiments across 11\nlarge-scale LiDAR datasets demonstrate our effectiveness and superiority. 
The\ncode and model checkpoints have been made publicly accessible.\n","authors":["Xiang Xu","Lingdong Kong","Hui Shuai","Liang Pan","Ziwei Liu","Qingshan Liu"],"pdf_url":"https://arxiv.org/pdf/2501.04004v1.pdf","comment":"Preprint; 26 pages, 17 figures, 7 tables; Project Page at\n https://ldkong.com/LiMoE"},{"id":"http://arxiv.org/abs/2501.04003v1","updated":"2025-01-07T18:59:55Z","published":"2025-01-07T18:59:55Z","title":"Are VLMs Ready for Autonomous Driving? An Empirical Study from the\n Reliability, Data, and Metric Perspectives","summary":" Recent advancements in Vision-Language Models (VLMs) have sparked interest in\ntheir use for autonomous driving, particularly in generating interpretable\ndriving decisions through natural language. However, the assumption that VLMs\ninherently provide visually grounded, reliable, and interpretable explanations\nfor driving remains largely unexamined. To address this gap, we introduce\nDriveBench, a benchmark dataset designed to evaluate VLM reliability across 17\nsettings (clean, corrupted, and text-only inputs), encompassing 19,200 frames,\n20,498 question-answer pairs, three question types, four mainstream driving\ntasks, and a total of 12 popular VLMs. Our findings reveal that VLMs often\ngenerate plausible responses derived from general knowledge or textual cues\nrather than true visual grounding, especially under degraded or missing visual\ninputs. This behavior, concealed by dataset imbalances and insufficient\nevaluation metrics, poses significant risks in safety-critical scenarios like\nautonomous driving. We further observe that VLMs struggle with multi-modal\nreasoning and display heightened sensitivity to input corruptions, leading to\ninconsistencies in performance. To address these challenges, we propose refined\nevaluation metrics that prioritize robust visual grounding and multi-modal\nunderstanding. 
Additionally, we highlight the potential of leveraging VLMs'\nawareness of corruptions to enhance their reliability, offering a roadmap for\ndeveloping more trustworthy and interpretable decision-making systems in\nreal-world autonomous driving contexts. The benchmark toolkit is publicly\naccessible.\n","authors":["Shaoyuan Xie","Lingdong Kong","Yuhao Dong","Chonghao Sima","Wenwei Zhang","Qi Alfred Chen","Ziwei Liu","Liang Pan"],"pdf_url":"https://arxiv.org/pdf/2501.04003v1.pdf","comment":"Preprint; 41 pages, 32 figures, 16 tables; Project Page at\n https://drive-bench.github.io/"},{"id":"http://arxiv.org/abs/2501.04002v1","updated":"2025-01-07T18:59:28Z","published":"2025-01-07T18:59:28Z","title":"Extraction Of Cumulative Blobs From Dynamic Gestures","summary":" Gesture recognition is a perceptual user interface, which is based on CV\ntechnology that allows the computer to interpret human motions as commands,\nallowing users to communicate with a computer without the use of hands, thus\nmaking the mouse and keyboard superfluous. Gesture recognition's main weakness\nis lighting conditions, because gesture control is based on computer vision,\nwhich heavily relies on cameras. These cameras are used to interpret gestures in 2D\nand 3D, so the extracted information can vary depending on the source of light.\nThe limitation is that the system cannot work in a dark environment. A simple night\nvision camera can be used as our camera for motion capture, as it also blasts\nout infrared light, which is not visible to humans but can be clearly seen with\na camera that has no infrared filter; this largely overcomes the limitation of\nsystems which cannot work in a dark environment. 
So, the video stream from the\ncamera is fed into a Raspberry Pi running a Python program with the OpenCV\nmodule, which is used for detecting, isolating, and tracking the path of the\ndynamic gesture; we then use a machine learning algorithm to recognize the\npattern drawn and accordingly control the GPIOs of the Raspberry Pi to perform\nsome activities.\n","authors":["Rishabh Naulakha","Shubham Gaur","Dhairya Lodha","Mehek Tulsyan","Utsav Kotecha"],"pdf_url":"https://arxiv.org/pdf/2501.04002v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04001v1","updated":"2025-01-07T18:58:54Z","published":"2025-01-07T18:58:54Z","title":"Sa2VA: Marrying SAM2 with LLaVA for Dense Grounded Understanding of\n Images and Videos","summary":" This work presents Sa2VA, the first unified model for dense grounded\nunderstanding of both images and videos. Unlike existing multi-modal large\nlanguage models, which are often limited to specific modalities and tasks,\nSa2VA supports a wide range of image and video tasks, including referring\nsegmentation and conversation, with minimal one-shot instruction tuning. Sa2VA\ncombines SAM-2, a foundation video segmentation model, with LLaVA, an advanced\nvision-language model, and unifies text, image, and video into a shared LLM\ntoken space. Using the LLM, Sa2VA generates instruction tokens that guide SAM-2\nin producing precise masks, enabling a grounded, multi-modal understanding of\nboth static and dynamic visual content. Additionally, we introduce Ref-SAV, an\nauto-labeled dataset containing over 72k object expressions in complex video\nscenes, designed to boost model performance. We also manually validate 2k video\nobjects in the Ref-SAV datasets to benchmark referring video object\nsegmentation in complex environments. 
Experiments show that Sa2VA achieves\nstate-of-the-art across multiple tasks, particularly in referring video object\nsegmentation, highlighting its potential for complex real-world applications.\n","authors":["Haobo Yuan","Xiangtai Li","Tao Zhang","Zilong Huang","Shilin Xu","Shunping Ji","Yunhai Tong","Lu Qi","Jiashi Feng","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2501.04001v1.pdf","comment":"Project page: https://lxtgh.github.io/project/sa2va"},{"id":"http://arxiv.org/abs/2501.03995v1","updated":"2025-01-07T18:52:05Z","published":"2025-01-07T18:52:05Z","title":"RAG-Check: Evaluating Multimodal Retrieval Augmented Generation\n Performance","summary":" Retrieval-augmented generation (RAG) improves large language models (LLMs) by\nusing external knowledge to guide response generation, reducing hallucinations.\nHowever, RAG, particularly multi-modal RAG, can introduce new hallucination\nsources: (i) the retrieval process may select irrelevant pieces (e.g.,\ndocuments, images) as raw context from the database, and (ii) retrieved images\nare processed into text-based context via vision-language models (VLMs) or\ndirectly used by multi-modal language models (MLLMs) like GPT-4o, which may\nhallucinate. To address this, we propose a novel framework to evaluate the\nreliability of multi-modal RAG using two performance measures: (i) the\nrelevancy score (RS), assessing the relevance of retrieved entries to the\nquery, and (ii) the correctness score (CS), evaluating the accuracy of the\ngenerated response. We train RS and CS models using a ChatGPT-derived database\nand human evaluator samples. Results show that both models achieve ~88%\naccuracy on test data. Additionally, we construct a 5000-sample human-annotated\ndatabase evaluating the relevancy of retrieved pieces and the correctness of\nresponse statements. Our RS model aligns with human preferences 20% more often\nthan CLIP in retrieval, and our CS model matches human preferences ~91% of the\ntime. 
Finally, we assess various RAG systems' selection and generation\nperformances using RS and CS.\n","authors":["Matin Mortaheb","Mohammad A. Amir Khojastepour","Srimat T. Chakradhar","Sennur Ulukus"],"pdf_url":"https://arxiv.org/pdf/2501.03995v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03992v1","updated":"2025-01-07T18:50:06Z","published":"2025-01-07T18:50:06Z","title":"NeuralSVG: An Implicit Representation for Text-to-Vector Generation","summary":" Vector graphics are essential in design, providing artists with a versatile\nmedium for creating resolution-independent and highly editable visual content.\nRecent advancements in vision-language and diffusion models have fueled\ninterest in text-to-vector graphics generation. However, existing approaches\noften suffer from over-parameterized outputs or treat the layered structure - a\ncore feature of vector graphics - as a secondary goal, diminishing their\npractical use. Recognizing the importance of layered SVG representations, we\npropose NeuralSVG, an implicit neural representation for generating vector\ngraphics from text prompts. Inspired by Neural Radiance Fields (NeRFs),\nNeuralSVG encodes the entire scene into the weights of a small MLP network,\noptimized using Score Distillation Sampling (SDS). To encourage a layered\nstructure in the generated SVG, we introduce a dropout-based regularization\ntechnique that strengthens the standalone meaning of each shape. We\nadditionally demonstrate that utilizing a neural representation provides an\nadded benefit of inference-time control, enabling users to dynamically adapt\nthe generated SVG based on user-provided inputs, all with a single learned\nrepresentation. 
Through extensive qualitative and quantitative evaluations, we\ndemonstrate that NeuralSVG outperforms existing methods in generating\nstructured and flexible SVG.\n","authors":["Sagi Polaczek","Yuval Alaluf","Elad Richardson","Yael Vinker","Daniel Cohen-Or"],"pdf_url":"https://arxiv.org/pdf/2501.03992v1.pdf","comment":"Project Page: https://sagipolaczek.github.io/NeuralSVG/"},{"id":"http://arxiv.org/abs/2406.14794v5","updated":"2025-01-07T18:49:42Z","published":"2024-06-20T23:51:32Z","title":"ImageFlowNet: Forecasting Multiscale Image-Level Trajectories of Disease\n Progression with Irregularly-Sampled Longitudinal Medical Images","summary":" Advances in medical imaging technologies have enabled the collection of\nlongitudinal images, which involve repeated scanning of the same patients over\ntime, to monitor disease progression. However, predictive modeling of such data\nremains challenging due to high dimensionality, irregular sampling, and data\nsparsity. To address these issues, we propose ImageFlowNet, a novel model\ndesigned to forecast disease trajectories from initial images while preserving\nspatial details. ImageFlowNet first learns multiscale joint representation\nspaces across patients and time points, then optimizes deterministic or\nstochastic flow fields within these spaces using a position-parameterized\nneural ODE/SDE framework. The model leverages a UNet architecture to create\nrobust multiscale representations and mitigates data scarcity by combining\nknowledge from all patients. We provide theoretical insights that support our\nformulation of ODEs, and motivate our regularizations involving high-level\nvisual features, latent space organization, and trajectory smoothness. We\nvalidate ImageFlowNet on three longitudinal medical image datasets depicting\nprogression in geographic atrophy, multiple sclerosis, and glioblastoma,\ndemonstrating its ability to effectively forecast disease progression and\noutperform existing methods. 
Our contributions include the development of\nImageFlowNet, its theoretical underpinnings, and empirical validation on\nreal-world datasets. The official implementation is available at\nhttps://github.com/KrishnaswamyLab/ImageFlowNet.\n","authors":["Chen Liu","Ke Xu","Liangbo L. Shen","Guillaume Huguet","Zilong Wang","Alexander Tong","Danilo Bzdok","Jay Stewart","Jay C. Wang","Lucian V. Del Priore","Smita Krishnaswamy"],"pdf_url":"https://arxiv.org/pdf/2406.14794v5.pdf","comment":"Accepted to ICASSP 2025"},{"id":"http://arxiv.org/abs/2501.03968v1","updated":"2025-01-07T18:06:27Z","published":"2025-01-07T18:06:27Z","title":"VLM-driven Behavior Tree for Context-aware Task Planning","summary":" The use of Large Language Models (LLMs) for generating Behavior Trees (BTs)\nhas recently gained attention in the robotics community, yet remains in its\nearly stages of development. In this paper, we propose a novel framework that\nleverages Vision-Language Models (VLMs) to interactively generate and edit BTs\nthat address visual conditions, enabling context-aware robot operations in\nvisually complex environments. A key feature of our approach lies in the\nconditional control through self-prompted visual conditions. Specifically, the\nVLM generates BTs with visual condition nodes, where conditions are expressed\nas free-form text. Another VLM process integrates the text into its prompt and\nevaluates the conditions against real-world images during robot execution. We\nvalidated our framework in a real-world cafe scenario, demonstrating both its\nfeasibility and limitations.\n","authors":["Naoki Wake","Atsushi Kanehira","Jun Takamatsu","Kazuhiro Sasabuchi","Katsushi Ikeuchi"],"pdf_url":"https://arxiv.org/pdf/2501.03968v1.pdf","comment":"10 pages, 11 figures, 5 tables. 
Last updated on January 7th, 2024"},{"id":"http://arxiv.org/abs/2501.03967v1","updated":"2025-01-07T18:05:24Z","published":"2025-01-07T18:05:24Z","title":"Temporal Feature Weaving for Neonatal Echocardiographic Viewpoint Video\n Classification","summary":" Automated viewpoint classification in echocardiograms can help\nunder-resourced clinics and hospitals in providing faster diagnosis and\nscreening when expert technicians may not be available. We propose a novel\napproach towards echocardiographic viewpoint classification. We show that\ntreating viewpoint classification as video classification rather than image\nclassification yields an advantage. We propose a CNN-GRU architecture with a novel\ntemporal feature weaving method, which leverages both spatial and temporal\ninformation to yield a 4.33\% increase in accuracy over baseline image\nclassification while using only four consecutive frames. The proposed approach\nincurs minimal computational overhead. Additionally, we publish the Neonatal\nEchocardiogram Dataset (NED), a professionally-annotated dataset providing\nsixteen viewpoints and associated echocardiography videos to encourage future\nwork and development in this field. Code available at:\nhttps://github.com/satchelfrench/NED\n","authors":["Satchel French","Faith Zhu","Amish Jain","Naimul Khan"],"pdf_url":"https://arxiv.org/pdf/2501.03967v1.pdf","comment":"Accepted to ISBI 2025"},{"id":"http://arxiv.org/abs/2501.03957v1","updated":"2025-01-07T17:37:57Z","published":"2025-01-07T17:37:57Z","title":"Vision Language Models as Values Detectors","summary":" Large Language Models integrating textual and visual inputs have introduced\nnew possibilities for interpreting complex data. Despite their remarkable\nability to generate coherent and contextually relevant text based on visual\nstimuli, the alignment of these models with human perception in identifying\nrelevant elements in images requires further exploration. 
This paper\ninvestigates the alignment between state-of-the-art LLMs and human annotators\nin detecting elements of relevance within home environment scenarios. We\ncreated a set of twelve images depicting various domestic scenarios and\nenlisted fourteen annotators to identify the key element in each image. We then\ncompared these human responses with outputs from five different LLMs, including\nGPT-4o and four LLaVA variants. Our findings reveal a varied degree of\nalignment, with LLaVA 34B showing the highest performance but still scoring\nlow. However, an analysis of the results highlights the models' potential to\ndetect value-laden elements in images, suggesting that with improved training\nand refined prompts, LLMs could enhance applications in social robotics,\nassistive technologies, and human-computer interaction by providing deeper\ninsights and more contextually relevant responses.\n","authors":["Giulio Antonio Abbo","Tony Belpaeme"],"pdf_url":"https://arxiv.org/pdf/2501.03957v1.pdf","comment":"13 pages, 2 figures"},{"id":"http://arxiv.org/abs/2405.18679v2","updated":"2025-01-07T17:00:36Z","published":"2024-05-29T01:01:19Z","title":"Vim-F: Visual State Space Model Benefiting from Learning in the\n Frequency Domain","summary":" In recent years, State Space Models (SSMs) with efficient hardware-aware\ndesigns, known as the Mamba deep learning models, have made significant\nprogress in modeling long sequences such as language understanding. Therefore,\nbuilding efficient and general-purpose visual backbones based on SSMs is a\npromising direction. Compared to traditional convolutional neural networks\n(CNNs) and Vision Transformers (ViTs), the performance of Vision Mamba (ViM)\nmethods is not yet fully competitive. To enable SSMs to process image data,\nViMs typically flatten 2D images into 1D sequences, inevitably ignoring some 2D\nlocal dependencies, thereby weakening the model's ability to interpret spatial\nrelationships from a global perspective. 
We use Fast Fourier Transform (FFT) to\nobtain the spectrum of the feature map and add it to the original feature map,\nenabling ViM to model a unified visual representation in both frequency and\nspatial domains. The introduction of frequency domain information enables ViM\nto have a global receptive field during scanning. We propose a novel model\ncalled Vim-F, which employs pure Mamba encoders and scans in both the frequency\nand spatial domains. Moreover, we question the necessity of position embedding\nin ViM and remove it accordingly in Vim-F, which helps to fully utilize the\nefficient long-sequence modeling capability of ViM. Finally, we redesign a\npatch embedding for Vim-F, leveraging a convolutional stem to capture more\nlocal correlations, further improving the performance of Vim-F. Code is\navailable at: \\url{https://github.com/yws-wxs/Vim-F}.\n","authors":["Juntao Zhang","Shaogeng Liu","Kun Bian","You Zhou","Pei Zhang","Wenbo An","Jun Zhou","Kun Shao"],"pdf_url":"https://arxiv.org/pdf/2405.18679v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03939v1","updated":"2025-01-07T17:00:35Z","published":"2025-01-07T17:00:35Z","title":"Visual question answering: from early developments to recent advances --\n a survey","summary":" Visual Question Answering (VQA) is an evolving research field aimed at\nenabling machines to answer questions about visual content by integrating image\nand language processing techniques such as feature extraction, object\ndetection, text embedding, natural language understanding, and language\ngeneration. With the growth of multimodal data research, VQA has gained\nsignificant attention due to its broad applications, including interactive\neducational tools, medical image diagnosis, customer service, entertainment,\nand social media captioning. 
Additionally, VQA plays a vital role in assisting\nvisually impaired individuals by generating descriptive content from images.\nThis survey introduces a taxonomy of VQA architectures, categorizing them based\non design choices and key components to facilitate comparative analysis and\nevaluation. We review major VQA approaches, focusing on deep learning-based\nmethods, and explore the emerging field of Large Visual Language Models (LVLMs)\nthat have demonstrated success in multimodal tasks like VQA. The paper further\nexamines available datasets and evaluation metrics essential for measuring VQA\nsystem performance, followed by an exploration of real-world VQA applications.\nFinally, we highlight ongoing challenges and future directions in VQA research,\npresenting open questions and potential areas for further development. This\nsurvey serves as a comprehensive resource for researchers and practitioners\ninterested in the latest advancements and future\n","authors":["Ngoc Dung Huynh","Mohamed Reda Bouadjenek","Sunil Aryal","Imran Razzak","Hakim Hacid"],"pdf_url":"https://arxiv.org/pdf/2501.03939v1.pdf","comment":"20"},{"id":"http://arxiv.org/abs/2501.00625v2","updated":"2025-01-07T16:49:29Z","published":"2024-12-31T19:53:27Z","title":"Gaussian Building Mesh (GBM): Extract a Building's 3D Mesh with Google\n Earth and Gaussian Splatting","summary":" Recently released open-source pre-trained foundational image segmentation and\nobject detection models (SAM2+GroundingDINO) allow for geometrically consistent\nsegmentation of objects of interest in multi-view 2D images. Users can use\ntext-based or click-based prompts to segment objects of interest without\nrequiring labeled training datasets. 
Gaussian Splatting allows for the learning\nof the 3D representation of a scene's geometry and radiance based on 2D images.\nCombining Google Earth Studio, SAM2+GroundingDINO, 2D Gaussian Splatting, and\nour improvements in mask refinement based on morphological operations and\ncontour simplification, we created a pipeline to extract the 3D mesh of any\nbuilding based on its name, address, or geographic coordinates.\n","authors":["Kyle Gao","Liangzhi Li","Hongjie He","Dening Lu","Linlin Xu","Jonathan Li"],"pdf_url":"https://arxiv.org/pdf/2501.00625v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03932v1","updated":"2025-01-07T16:48:47Z","published":"2025-01-07T16:48:47Z","title":"CoStruction: Conjoint radiance field optimization for urban scene\n reconStruction with limited image overlap","summary":" Reconstructing the surrounding surface geometry from recorded driving\nsequences poses a significant challenge due to the limited image overlap and\ncomplex topology of urban environments. SoTA neural implicit surface\nreconstruction methods often struggle in such setting, either failing due to\nsmall vision overlap or exhibiting suboptimal performance in accurately\nreconstructing both the surface and fine structures. To address these\nlimitations, we introduce CoStruction, a novel hybrid implicit surface\nreconstruction method tailored for large driving sequences with limited camera\noverlap. CoStruction leverages cross-representation uncertainty estimation to\nfilter out ambiguous geometry caused by limited observations. Our method\nperforms joint optimization of both radiance fields in addition to guided\nsampling achieving accurate reconstruction of large areas along with fine\nstructures in complex urban scenarios. 
Extensive evaluation on major driving\ndatasets demonstrates the superiority of our approach in reconstructing large\ndriving sequences with limited image overlap, outperforming concurrent SoTA\nmethods.\n","authors":["Fusang Wang","Hala Djeghim","Nathan Piasco","Moussab Bennehar","Luis Roldão","Dzmitry Tsishkou"],"pdf_url":"https://arxiv.org/pdf/2501.03932v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03931v1","updated":"2025-01-07T16:48:31Z","published":"2025-01-07T16:48:31Z","title":"Magic Mirror: ID-Preserved Video Generation in Video Diffusion\n Transformers","summary":" We present Magic Mirror, a framework for generating identity-preserved videos\nwith cinematic-level quality and dynamic motion. While recent advances in video\ndiffusion models have shown impressive capabilities in text-to-video\ngeneration, maintaining consistent identity while producing natural motion\nremains challenging. Previous methods either require person-specific\nfine-tuning or struggle to balance identity preservation with motion diversity.\nBuilt upon Video Diffusion Transformers, our method introduces three key\ncomponents: (1) a dual-branch facial feature extractor that captures both\nidentity and structural features, (2) a lightweight cross-modal adapter with\nConditioned Adaptive Normalization for efficient identity integration, and (3)\na two-stage training strategy combining synthetic identity pairs with video\ndata. Extensive experiments demonstrate that Magic Mirror effectively balances\nidentity consistency with natural motion, outperforming existing methods across\nmultiple metrics while requiring minimal parameters added. The code and model\nwill be made publicly available at:\nhttps://github.com/dvlab-research/MagicMirror/\n","authors":["Yuechen Zhang","Yaoyang Liu","Bin Xia","Bohao Peng","Zexin Yan","Eric Lo","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2501.03931v1.pdf","comment":"It is best viewed in Acrobat. 
Project Page:\n https://julianjuaner.github.io/projects/MagicMirror/"},{"id":"http://arxiv.org/abs/2501.03923v1","updated":"2025-01-07T16:35:29Z","published":"2025-01-07T16:35:29Z","title":"Explainable AI model reveals disease-related mechanisms in single-cell\n RNA-seq data","summary":" Neurodegenerative diseases (NDDs) are complex and lack effective treatment\ndue to their poorly understood mechanism. The increasingly used data analysis\nfrom Single nucleus RNA Sequencing (snRNA-seq) allows to explore transcriptomic\nevents at a single cell level, yet face challenges in interpreting the\nmechanisms underlying a disease. On the other hand, Neural Network (NN) models\ncan handle complex data to offer insights but can be seen as black boxes with\npoor interpretability. In this context, explainable AI (XAI) emerges as a\nsolution that could help to understand disease-associated mechanisms when\ncombined with efficient NN models. However, limited research explores XAI in\nsingle-cell data. In this work, we implement a method for identifying\ndisease-related genes and the mechanistic explanation of disease progression\nbased on NN model combined with SHAP. We analyze available Huntington's disease\n(HD) data to identify both HD-altered genes and mechanisms by adding Gene Set\nEnrichment Analysis (GSEA) comparing two methods, differential gene expression\nanalysis (DGE) and NN combined with SHAP approach. 
Our results show that DGE\nand SHAP approaches offer both common and differential sets of altered genes\nand pathways, reinforcing the usefulness of XAI methods for a broader\nperspective of disease.\n","authors":["Mohammad Usman","Olga Varea","Petia Radeva","Josep Canals","Jordi Abante","Daniel Ortiz"],"pdf_url":"https://arxiv.org/pdf/2501.03923v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03916v1","updated":"2025-01-07T16:31:10Z","published":"2025-01-07T16:31:10Z","title":"Dolphin: Closed-loop Open-ended Auto-research through Thinking,\n Practice, and Feedback","summary":" The scientific research paradigm is undergoing a profound transformation\nowing to the development of Artificial Intelligence (AI). Recent works\ndemonstrate that various AI-assisted research methods can largely improve\nresearch efficiency by improving data analysis, accelerating computation, and\nfostering novel idea generation. To further move towards the ultimate goal\n(i.e., automatic scientific research), in this paper, we propose Dolphin, the\nfirst closed-loop open-ended auto-research framework to further build the\nentire process of human scientific research. Dolphin can generate research\nideas, perform experiments, and get feedback from experimental results to\ngenerate higher-quality ideas. More specifically, Dolphin first generates novel\nideas based on relevant papers which are ranked by the topic and task\nattributes. Then, the codes are automatically generated and debugged with the\nexception-traceback-guided local code structure. Finally, Dolphin automatically\nanalyzes the results of each idea and feeds the results back to the next round\nof idea generation. Experiments are conducted on the benchmark datasets of\ndifferent topics and results show that Dolphin can generate novel ideas\ncontinuously and complete the experiment in a loop. 
We highlight that Dolphin\ncan automatically propose methods that are comparable to the state-of-the-art\nin some tasks such as 2D image classification and 3D point classification.\n","authors":["Jiakang Yuan","Xiangchao Yan","Botian Shi","Tao Chen","Wanli Ouyang","Bo Zhang","Lei Bai","Yu Qiao","Bowen Zhou"],"pdf_url":"https://arxiv.org/pdf/2501.03916v1.pdf","comment":"19 pages, 11 figures, and our homepage:\n https://unimodal4reasoning.github.io/Dolphin-project-page/"},{"id":"http://arxiv.org/abs/2501.03910v1","updated":"2025-01-07T16:24:43Z","published":"2025-01-07T16:24:43Z","title":"HYB-VITON: A Hybrid Approach to Virtual Try-On Combining Explicit and\n Implicit Warping","summary":" Virtual try-on systems have significant potential in e-commerce, allowing\ncustomers to visualize garments on themselves. Existing image-based methods\nfall into two categories: those that directly warp garment-images onto\nperson-images (explicit warping), and those using cross-attention to\nreconstruct given garments (implicit warping). Explicit warping preserves\ngarment details but often produces unrealistic output, while implicit warping\nachieves natural reconstruction but struggles with fine details. We propose\nHYB-VITON, a novel approach that combines the advantages of each method and\nincludes both a preprocessing pipeline for warped garments and a novel training\noption. These components allow us to utilize beneficial regions of explicitly\nwarped garments while leveraging the natural reconstruction of implicit\nwarping. 
A series of experiments demonstrates that HYB-VITON preserves garment\ndetails more faithfully than recent diffusion-based methods, while producing\nmore realistic results than a state-of-the-art explicit warping method.\n","authors":["Kosuke Takemoto","Takafumi Koshinaka"],"pdf_url":"https://arxiv.org/pdf/2501.03910v1.pdf","comment":"Accepted at IEEE ICASSP 2025"},{"id":"http://arxiv.org/abs/2402.16315v4","updated":"2025-01-07T16:05:16Z","published":"2024-02-26T05:43:51Z","title":"Finer: Investigating and Enhancing Fine-Grained Visual Concept\n Recognition in Large Vision Language Models","summary":" Recent advances in instruction-tuned Large Vision-Language Models (LVLMs)\nhave imbued the models with the ability to generate high-level, image-grounded\nexplanations with ease. While such capability is largely attributed to the rich\nworld knowledge contained within the Large Language Models (LLMs), our work\nreveals their shortcomings in fine-grained visual categorization (FGVC) across\nsix different benchmark settings. Most recent state-of-the-art LVLMs like\nLLaVa-1.5, InstructBLIP and GPT-4V not only severely deteriorate in terms of\nclassification performance, e.g., average drop of 65.58 in EM for Stanford Dogs\nfor LLaVA-1.5, but also struggle to generate an accurate explanation with\ndetailed attributes based on the concept that appears within an input image\ndespite their capability to generate holistic image-level descriptions.\nIn-depth analyses show that instruction-tuned LVLMs exhibit modality gap,\nshowing discrepancy when given textual and visual inputs that correspond to the\nsame concept, preventing the image modality from leveraging the rich parametric\nknowledge within the LLMs. 
In an effort to further the community's endeavor in\nthis direction, we propose a multiple granularity attribute-centric evaluation\nbenchmark, Finer, which aims to establish a ground to evaluate LVLMs'\nfine-grained visual comprehension ability and provide significantly improved\nexplainability.\n","authors":["Jeonghwan Kim","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2402.16315v4.pdf","comment":"EMNLP 2024; Main Conference"},{"id":"http://arxiv.org/abs/2501.03895v1","updated":"2025-01-07T16:03:14Z","published":"2025-01-07T16:03:14Z","title":"LLaVA-Mini: Efficient Image and Video Large Multimodal Models with One\n Vision Token","summary":" The advent of real-time large multimodal models (LMMs) like GPT-4o has\nsparked considerable interest in efficient LMMs. LMM frameworks typically\nencode visual inputs into vision tokens (continuous representations) and\nintegrate them and textual instructions into the context of large language\nmodels (LLMs), where large-scale parameters and numerous context tokens\n(predominantly vision tokens) result in substantial computational overhead.\nPrevious efforts towards efficient LMMs always focus on replacing the LLM\nbackbone with smaller models, while neglecting the crucial issue of token\nquantity. In this paper, we introduce LLaVA-Mini, an efficient LMM with minimal\nvision tokens. To achieve a high compression ratio of vision tokens while\npreserving visual information, we first analyze how LMMs understand vision\ntokens and find that most vision tokens only play a crucial role in the early\nlayers of LLM backbone, where they mainly fuse visual information into text\ntokens. 
Building on this finding, LLaVA-Mini introduces modality pre-fusion to\nfuse visual information into text tokens in advance, thereby facilitating the\nextreme compression of vision tokens fed to LLM backbone into one token.\nLLaVA-Mini is a unified large multimodal model that can support the\nunderstanding of images, high-resolution images, and videos in an efficient\nmanner. Experiments across 11 image-based and 7 video-based benchmarks\ndemonstrate that LLaVA-Mini outperforms LLaVA-v1.5 with just 1 vision token\ninstead of 576. Efficiency analyses reveal that LLaVA-Mini can reduce FLOPs by\n77%, deliver low-latency responses within 40 milliseconds, and process over\n10,000 frames of video on the GPU hardware with 24GB of memory.\n","authors":["Shaolei Zhang","Qingkai Fang","Zhe Yang","Yang Feng"],"pdf_url":"https://arxiv.org/pdf/2501.03895v1.pdf","comment":"Code: https://github.com/ictnlp/LLaVA-Mini; Model:\n https://huggingface.co/ICTNLP/llava-mini-llama-3.1-8b"},{"id":"http://arxiv.org/abs/2501.03891v1","updated":"2025-01-07T15:54:03Z","published":"2025-01-07T15:54:03Z","title":"Superpixel Boundary Correction for Weakly-Supervised Semantic\n Segmentation on Histopathology Images","summary":" With the rapid advancement of deep learning, computational pathology has made\nsignificant progress in cancer diagnosis and subtyping. Tissue segmentation is\na core challenge, essential for prognosis and treatment decisions. Weakly\nsupervised semantic segmentation (WSSS) reduces the annotation requirement by\nusing image-level labels instead of pixel-level ones. However, Class Activation\nMap (CAM)-based methods still suffer from low spatial resolution and unclear\nboundaries. To address these issues, we propose a multi-level superpixel\ncorrection algorithm that refines CAM boundaries using superpixel clustering\nand floodfill. 
Experimental results show that our method achieves great\nperformance on breast cancer segmentation dataset with mIoU of 71.08%,\nsignificantly improving tumor microenvironment boundary delineation.\n","authors":["Hongyi Wu","Hong Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.03891v1.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2405.03732v3","updated":"2025-01-07T15:46:25Z","published":"2024-05-06T10:53:13Z","title":"Deep Learning-based Accelerated MR Cholangiopancreatography without\n Fully-sampled Data","summary":" The purpose of this study was to accelerate MR cholangiopancreatography\n(MRCP) acquisitions using deep learning-based (DL) reconstruction at 3T and\n0.55T. A total of 35 healthy volunteers underwent conventional two-fold\naccelerated MRCP scans at field strengths of 3T and 0.55T. We trained DL\nreconstructions using two different training strategies, supervised (SV) and\nself-supervised (SSV), with retrospectively six-fold undersampled data obtained\nat 3T. We then evaluated the DL reconstructions against standard techniques,\nparallel imaging (PI) and compressed sensing (CS), focusing on peak\nsignal-to-noise ratio (PSNR) and structural similarity (SSIM) as metrics. We\nalso tested DL reconstructions with prospectively accelerated acquisitions and\nevaluated their robustness when changing fields strengths from 3T to 0.55T. DL\nreconstructions demonstrated a reduction in average acquisition time from\n599/542 to 255/180 seconds for MRCP at 3T/0.55T. In both retrospective and\nprospective undersampling, PSNR and SSIM of DL reconstructions were higher than\nthose of PI and CS. At the same time, DL reconstructions preserved the image\nquality of undersampled data, including sharpness and the visibility of\nhepatobiliary ducts. In addition, both DL approaches produced high-quality\nreconstructions at 0.55T. 
In summary, DL reconstructions trained for highly\naccelerated MRCP enabled a reduction in acquisition time by a factor of 2.4/3.0\nat 3T/0.55T while maintaining the image quality of conventional acquisitions.\n","authors":["Jinho Kim","Marcel Dominik Nickel","Florian Knoll"],"pdf_url":"https://arxiv.org/pdf/2405.03732v3.pdf","comment":"19 pages, 4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2501.03880v1","updated":"2025-01-07T15:43:36Z","published":"2025-01-07T15:43:36Z","title":"SELMA3D challenge: Self-supervised learning for 3D light-sheet\n microscopy image segmentation","summary":" Recent innovations in light sheet microscopy, paired with developments in\ntissue clearing techniques, enable the 3D imaging of large mammalian tissues\nwith cellular resolution. Combined with the progress in large-scale data\nanalysis, driven by deep learning, these innovations empower researchers to\nrapidly investigate the morphological and functional properties of diverse\nbiological samples. Segmentation, a crucial preliminary step in the analysis\nprocess, can be automated using domain-specific deep learning models with\nexpert-level performance. However, these models exhibit high sensitivity to\ndomain shifts, leading to a significant drop in accuracy when applied to data\noutside their training distribution. To address this limitation, and inspired\nby the recent success of self-supervised learning in training generalizable\nmodels, we organized the SELMA3D Challenge during the MICCAI 2024 conference.\nSELMA3D provides a vast collection of light-sheet images from cleared mice and\nhuman brains, comprising 35 large 3D images-each with over 1000^3 voxels-and\n315 annotated small patches for finetuning, preliminary testing and final\ntesting. The dataset encompasses diverse biological structures, including\nvessel-like and spot-like structures. 
Five teams participated in all phases of\nthe challenge, and their proposed methods are reviewed in this paper.\nQuantitative and qualitative results from most participating teams demonstrate\nthat self-supervised learning on large datasets improves segmentation model\nperformance and generalization. We will continue to support and extend SELMA3D\nas an inaugural MICCAI challenge focused on self-supervised learning for 3D\nmicroscopy image segmentation.\n","authors":["Ying Chen","Rami Al-Maskari","Izabela Horvath","Mayar Ali","Luciano Höher","Kaiyuan Yang","Zengming Lin","Zhiwei Zhai","Mengzhe Shen","Dejin Xun","Yi Wang","Tony Xu","Maged Goubran","Yunheng Wu","Ali Erturk","Johannes C. Paetzold"],"pdf_url":"https://arxiv.org/pdf/2501.03880v1.pdf","comment":"1st version"},{"id":"http://arxiv.org/abs/2501.03879v1","updated":"2025-01-07T15:42:32Z","published":"2025-01-07T15:42:32Z","title":"CL3DOR: Contrastive Learning for 3D Large Multimodal Models via Odds\n Ratio on High-Resolution Point Clouds","summary":" Recent research has demonstrated that Large Language Models (LLMs) are not\nlimited to text-only tasks but can also function as multimodal models across\nvarious modalities, including audio, images, and videos. In particular,\nresearch on 3D Large Multimodal Models (3D LMMs) is making notable strides,\ndriven by the potential of processing higher-dimensional data like point\nclouds. However, upon closer examination, we find that the visual and textual\ncontent within each sample of existing training datasets lacks both high\ninformational granularity and clarity, which serve as a bottleneck for precise\ncross-modal understanding. To address these issues, we propose CL3DOR,\nContrastive Learning for 3D large multimodal models via Odds ratio on\nhigh-Resolution point clouds, designed to ensure greater specificity and\nclarity in both visual and textual content. 
Specifically, we increase the\ndensity of point clouds per object and construct informative hard negative\nresponses in the training dataset to penalize unwanted responses. To leverage\nhard negative responses, we incorporate the odds ratio as an auxiliary term for\ncontrastive learning into the conventional language modeling loss. CL3DOR\nachieves state-of-the-art performance in 3D scene understanding and reasoning\nbenchmarks. Additionally, we demonstrate the effectiveness of CL3DOR's key\ncomponents through extensive experiments.\n","authors":["Keonwoo Kim","Yeongjae Cho","Taebaek Hwang","Minsoo Jo","Sangdo Han"],"pdf_url":"https://arxiv.org/pdf/2501.03879v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03875v1","updated":"2025-01-07T15:39:02Z","published":"2025-01-07T15:39:02Z","title":"ZDySS -- Zero-Shot Dynamic Scene Stylization using Gaussian Splatting","summary":" Stylizing a dynamic scene based on an exemplar image is critical for various\nreal-world applications, including gaming, filmmaking, and augmented and\nvirtual reality. However, achieving consistent stylization across both spatial\nand temporal dimensions remains a significant challenge. Most existing methods\nare designed for static scenes and often require an optimization process for\neach style image, limiting their adaptability. We introduce ZDySS, a zero-shot\nstylization framework for dynamic scenes, allowing our model to generalize to\npreviously unseen style images at inference. Our approach employs Gaussian\nsplatting for scene representation, linking each Gaussian to a learned feature\nvector that renders a feature map for any given view and timestamp. By applying\nstyle transfer on the learned feature vectors instead of the rendered feature\nmap, we enhance spatio-temporal consistency across frames. 
Our method\ndemonstrates superior performance and coherence over state-of-the-art baselines\nin tests on real-world dynamic scenes, making it a robust solution for\npractical applications.\n","authors":["Abhishek Saroha","Florian Hofherr","Mariia Gladkova","Cecilia Curreli","Or Litany","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2501.03875v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03874v1","updated":"2025-01-07T15:38:13Z","published":"2025-01-07T15:38:13Z","title":"Neuromorphic Optical Tracking and Imaging of Randomly Moving Targets\n through Strongly Scattering Media","summary":" Tracking and acquiring simultaneous optical images of randomly moving targets\nobscured by scattering media remains a challenging problem of importance to\nmany applications that require precise object localization and identification.\nIn this work we develop an end-to-end neuromorphic optical engineering and\ncomputational approach to demonstrate how to track and image normally invisible\nobjects by combining an event detecting camera with a multistage neuromorphic\ndeep learning strategy. Photons emerging from dense scattering media are\ndetected by the event camera and converted to pixel-wise asynchronized spike\ntrains - a first step in isolating object-specific information from the\ndominant uninformative background. Spiking data is fed into a deep spiking\nneural network (SNN) engine where object tracking and image reconstruction are\nperformed by two separate yet interconnected modules running in parallel in\ndiscrete time steps over the event duration. Through benchtop experiments we\ndemonstrate tracking and imaging randomly moving objects in dense turbid media\nas well as image reconstruction of spatially stationary but optically dynamic\nobjects. Standardized character sets serve as representative proxies for\ngeometrically complex objects, underscoring the method's generality. 
The\nresults highlight the advantages of a fully neuromorphic approach in meeting a\nmajor imaging technology with high computational efficiency and low power\nconsumption.\n","authors":["Ning Zhang","Timothy Shea","Arto Nurmikko"],"pdf_url":"https://arxiv.org/pdf/2501.03874v1.pdf","comment":"22 pages, 6 figures"},{"id":"http://arxiv.org/abs/2412.12359v2","updated":"2025-01-07T15:36:54Z","published":"2024-12-16T21:14:11Z","title":"LLaVA Steering: Visual Instruction Tuning with 500x Fewer Parameters\n through Modality Linear Representation-Steering","summary":" Multimodal Large Language Models (MLLMs) have significantly advanced visual\ntasks by integrating visual representations into large language models (LLMs).\nThe textual modality, inherited from LLMs, equips MLLMs with abilities like\ninstruction following and in-context learning. In contrast, the visual modality\nenhances performance in downstream tasks by leveraging rich semantic content,\nspatial information, and grounding capabilities. These intrinsic modalities\nwork synergistically across various visual tasks. Our research initially\nreveals a persistent imbalance between these modalities, with text often\ndominating output generation during visual instruction tuning. This imbalance\noccurs when using both full fine-tuning and parameter-efficient fine-tuning\n(PEFT) methods. We then found that re-balancing these modalities can\nsignificantly reduce the number of trainable parameters required, inspiring a\ndirection for further optimizing visual instruction tuning. We introduce\nModality Linear Representation-Steering (MoReS) to achieve the goal. MoReS\neffectively re-balances the intrinsic modalities throughout the model, where\nthe key idea is to steer visual representations through linear transformations\nin the visual subspace across each model layer. To validate our solution, we\ncomposed LLaVA Steering, a suite of models integrated with the proposed MoReS\nmethod. 
Evaluation results show that the composed LLaVA Steering models\nrequire, on average, 500 times fewer trainable parameters than LoRA needs while\nstill achieving comparable performance across three visual benchmarks and eight\nvisual question-answering tasks. Last, we present the LLaVA Steering Factory,\nan in-house developed platform that enables researchers to quickly customize\nvarious MLLMs with component-based architecture for seamlessly integrating\nstate-of-the-art models, and evaluate their intrinsic modality imbalance.\n","authors":["Jinhe Bi","Yujun Wang","Haokun Chen","Xun Xiao","Artur Hecker","Volker Tresp","Yunpu Ma"],"pdf_url":"https://arxiv.org/pdf/2412.12359v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03848v1","updated":"2025-01-07T15:03:55Z","published":"2025-01-07T15:03:55Z","title":"Semise: Semi-supervised learning for severity representation in medical\n image","summary":" This paper introduces SEMISE, a novel method for representation learning in\nmedical imaging that combines self-supervised and supervised learning. By\nleveraging both labeled and augmented data, SEMISE addresses the challenge of\ndata scarcity and enhances the encoder's ability to extract meaningful\nfeatures. This integrated approach leads to more informative representations,\nimproving performance on downstream tasks. As a result, our approach achieved a\n12% improvement in classification and a 3% improvement in segmentation,\noutperforming existing methods. These results demonstrate the potential of\nSEMISE to advance medical image analysis and offer more accurate solutions for\nhealthcare applications, particularly in contexts where labeled data is\nlimited.\n","authors":["Dung T. 
Tran","Hung Vu","Anh Tran","Hieu Pham","Hong Nguyen","Phong Nguyen"],"pdf_url":"https://arxiv.org/pdf/2501.03848v1.pdf","comment":"Accepted for presentation at the 2025 IEEE 22nd International\n Symposium on Biomedical Imaging (ISBI)"},{"id":"http://arxiv.org/abs/2501.03847v1","updated":"2025-01-07T15:01:58Z","published":"2025-01-07T15:01:58Z","title":"Diffusion as Shader: 3D-aware Video Diffusion for Versatile Video\n Generation Control","summary":" Diffusion models have demonstrated impressive performance in generating\nhigh-quality videos from text prompts or images. However, precise control over\nthe video generation process, such as camera manipulation or content editing,\nremains a significant challenge. Existing methods for controlled video\ngeneration are typically limited to a single control type, lacking the\nflexibility to handle diverse control demands. In this paper, we introduce\nDiffusion as Shader (DaS), a novel approach that supports multiple video\ncontrol tasks within a unified architecture. Our key insight is that achieving\nversatile video control necessitates leveraging 3D control signals, as videos\nare fundamentally 2D renderings of dynamic 3D content. Unlike prior methods\nlimited to 2D control signals, DaS leverages 3D tracking videos as control\ninputs, making the video diffusion process inherently 3D-aware. This innovation\nallows DaS to achieve a wide range of video controls by simply manipulating the\n3D tracking videos. A further advantage of using 3D tracking videos is their\nability to effectively link frames, significantly enhancing the temporal\nconsistency of the generated videos. 
With just 3 days of fine-tuning on 8 H800\nGPUs using less than 10k videos, DaS demonstrates strong control capabilities\nacross diverse tasks, including mesh-to-video generation, camera control,\nmotion transfer, and object manipulation.\n","authors":["Zekai Gu","Rui Yan","Jiahao Lu","Peng Li","Zhiyang Dou","Chenyang Si","Zhen Dong","Qifeng Liu","Cheng Lin","Ziwei Liu","Wenping Wang","Yuan Liu"],"pdf_url":"https://arxiv.org/pdf/2501.03847v1.pdf","comment":"Project page: https://igl-hkust.github.io/das/ Codes:\n https://github.com/IGL-HKUST/DiffusionAsShader"},{"id":"http://arxiv.org/abs/2403.18873v2","updated":"2025-01-07T14:52:34Z","published":"2024-03-26T14:42:46Z","title":"Predicting risk of cardiovascular disease using retinal OCT imaging","summary":" Cardiovascular diseases (CVD) are the leading cause of death globally.\nNon-invasive, cost-effective imaging techniques play a crucial role in early\ndetection and prevention of CVD. Optical coherence tomography (OCT) has gained\nrecognition as a potential tool for early CVD risk prediction, though its use\nremains underexplored. In this study, we investigated the potential of OCT as\nan additional imaging technique to predict future CVD events. We analysed\nretinal OCT data from the UK Biobank. The dataset included 612 patients who\nsuffered a myocardial infarction (MI) or stroke within five years of imaging\nand 2,234 controls without CVD (total: 2,846 participants). A self-supervised\ndeep learning approach based on Variational Autoencoders (VAE) was used to\nextract low-dimensional latent representations from high-dimensional 3D OCT\nimages, capturing distinct features of retinal layers. These latent features,\nalong with clinical data, were used to train a Random Forest (RF) classifier to\ndifferentiate between patients at risk of future CVD events (MI or stroke) and\nhealthy controls. 
Our model achieved an AUC of 0.75, sensitivity of 0.70,\nspecificity of 0.70, and accuracy of 0.70, outperforming the QRISK3 score (the\nthird version of the QRISK cardiovascular disease risk prediction algorithm;\nAUC = 0.60, sensitivity = 0.60, specificity = 0.55, accuracy = 0.55). The\nchoroidal layer in OCT images was identified as a key predictor of future CVD\nevents, revealed through a novel model explainability approach. This study\ndemonstrates that retinal OCT imaging is a cost-effective, non-invasive\nalternative for predicting CVD risk, offering potential for widespread\napplication in optometry practices and hospitals.\n","authors":["Cynthia Maldonado-Garcia","Rodrigo Bonazzola","Enzo Ferrante","Thomas H Julian","Panagiotis I Sergouniotis","Nishant Ravikumara","Alejandro F Frangi"],"pdf_url":"https://arxiv.org/pdf/2403.18873v2.pdf","comment":"New version - 26 pages for main manuscript, 7 figures, 7 pages for\n appendix and preprint for a journal"},{"id":"http://arxiv.org/abs/2501.03839v1","updated":"2025-01-07T14:49:12Z","published":"2025-01-07T14:49:12Z","title":"MedFocusCLIP : Improving few shot classification in medical datasets\n using pixel wise attention","summary":" With the popularity of foundational models, parameter efficient fine tuning\nhas become the defacto approach to leverage pretrained models to perform\ndownstream tasks. Taking inspiration from recent advances in large language\nmodels, Visual Prompt Tuning, and similar techniques, learn an additional\nprompt to efficiently finetune a pretrained vision foundational model. However,\nwe observe that such prompting is insufficient for fine-grained visual\nclassification tasks such as medical image classification, where there is large\ninter-class variance, and small intra-class variance. 
Hence, in this paper we\npropose to leverage the advanced segmentation capabilities of Segment Anything\nModel 2 (SAM2) as a visual prompting cue to help the visual encoder in CLIP\n(Contrastive Language-Image Pretraining) by guiding the attention of the CLIP\nvisual encoder to relevant regions in the image. This helps the model to focus\non highly discriminative regions, without getting distracted by visually\nsimilar background features, an essential requirement in a few-shot,\nfine-grained classification setting. We evaluate our method on diverse medical\ndatasets including X-rays, CT scans, and MRI images, and report an accuracy of\n(71%, 81%, 86%, 58%) from the proposed approach on (COVID, lung-disease,\nbrain-tumor, breast-cancer) datasets against (66%, 70%, 68%, 29%) from a\npretrained CLIP model after few-shot training. The proposed approach also makes\nit possible to obtain interpretable explanations of the classification\nperformance through the localization obtained using segmentation.\n","authors":["Aadya Arora","Vinay Namboodiri"],"pdf_url":"https://arxiv.org/pdf/2501.03839v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03838v1","updated":"2025-01-07T14:47:15Z","published":"2025-01-07T14:47:15Z","title":"LM-Net: A Light-weight and Multi-scale Network for Medical Image\n Segmentation","summary":" Current medical image segmentation approaches have limitations in deeply\nexploring multi-scale information and effectively combining local detail\ntextures with global contextual semantic information. This results in\nover-segmentation, under-segmentation, and blurred segmentation boundaries. To\ntackle these challenges, we explore multi-scale feature representations from\ndifferent perspectives, proposing a novel, lightweight, and multi-scale\narchitecture (LM-Net) that integrates advantages of both Convolutional Neural\nNetworks (CNNs) and Vision Transformers (ViTs) to enhance segmentation\naccuracy. 
LM-Net employs a lightweight multi-branch module to capture\nmulti-scale features at the same level. Furthermore, we introduce two modules\nto concurrently capture local detail textures and global semantics with\nmulti-scale features at different levels: the Local Feature Transformer (LFT)\nand Global Feature Transformer (GFT). The LFT integrates local window\nself-attention to capture local detail textures, while the GFT leverages global\nself-attention to capture global contextual semantics. By combining these\nmodules, our model achieves complementarity between local and global\nrepresentations, alleviating the problem of blurred segmentation boundaries in\nmedical image segmentation. To evaluate the feasibility of LM-Net, extensive\nexperiments have been conducted on three publicly available datasets with\ndifferent modalities. Our proposed model achieves state-of-the-art results,\nsurpassing previous methods, while only requiring 4.66G FLOPs and 5.4M\nparameters. These state-of-the-art results on three datasets with different\nmodalities demonstrate the effectiveness and adaptability of our proposed\nLM-Net for various medical image segmentation tasks.\n","authors":["Zhenkun Lu","Chaoyin She","Wei Wang","Qinghua Huang"],"pdf_url":"https://arxiv.org/pdf/2501.03838v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03836v1","updated":"2025-01-07T14:45:39Z","published":"2025-01-07T14:45:39Z","title":"SCC-YOLO: An Improved Object Detector for Assisting in Brain Tumor\n Diagnosis","summary":" Brain tumors can result in neurological dysfunction, alterations in cognitive\nand psychological states, increased intracranial pressure, and the occurrence\nof seizures, thereby presenting a substantial risk to human life and health.\nThe You Only Look Once(YOLO) series models have demonstrated superior accuracy\nin object detection for medical imaging. In this paper, we develop a novel\nSCC-YOLO architecture by integrating the SCConv attention mechanism into\nYOLOv9. 
The SCConv module reconstructs an efficient convolutional module by\nreducing spatial and channel redundancy among features, thereby enhancing the\nlearning of image features. We investigate the impact of integrating different\nattention mechanisms with the YOLOv9 model on brain tumor image detection using\nboth the Br35H dataset and our self-made dataset (Brain_Tumor_Dataset).\nExperimental results show that on the Br35H dataset, SCC-YOLO achieved a 0.3%\nimprovement in mAP50 compared to YOLOv9, while on our self-made dataset,\nSCC-YOLO exhibited a 0.5% improvement over YOLOv9. SCC-YOLO has reached\nstate-of-the-art performance in brain tumor detection. Source code is available\nat: https://jihulab.com/healthcare-information-studio/SCC-YOLO/-/tree/master\n","authors":["Runci Bai"],"pdf_url":"https://arxiv.org/pdf/2501.03836v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03830v1","updated":"2025-01-07T14:41:26Z","published":"2025-01-07T14:41:26Z","title":"MeshConv3D: Efficient convolution and pooling operators for triangular\n 3D meshes","summary":" Convolutional neural networks (CNNs) have been pivotal in various 2D image\nanalysis tasks, including computer vision, image indexing and retrieval or\nsemantic classification. Extending CNNs to 3D data such as point clouds and 3D\nmeshes raises significant challenges since the very basic convolution and\npooling operators need to be completely re-visited and re-defined in an\nappropriate manner to tackle irregular connectivity issues. In this paper, we\nintroduce MeshConv3D, a 3D mesh-dedicated methodology integrating specialized\nconvolution and face collapse-based pooling operators. MeshConv3D operates\ndirectly on meshes of arbitrary topology, without any need of prior\nre-meshing/conversion techniques. In order to validate our approach, we have\nconsidered a semantic classification task. 
The experimental results obtained on\nthree distinct benchmark datasets show that the proposed approach makes it\npossible to achieve equivalent or superior classification results, while\nminimizing the related memory footprint and computational load.\n","authors":["Germain Bregeon","Marius Preda","Radu Ispas","Titus Zaharia"],"pdf_url":"https://arxiv.org/pdf/2501.03830v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16169v5","updated":"2025-01-07T14:39:31Z","published":"2024-03-24T14:24:13Z","title":"Gaze-guided Hand-Object Interaction Synthesis: Dataset and Method","summary":" Gaze plays a crucial role in revealing human attention and intention,\nparticularly in hand-object interaction scenarios, where it guides and\nsynchronizes complex tasks that require precise coordination between the brain,\nhand, and object. Motivated by this, we introduce a novel task: Gaze-Guided\nHand-Object Interaction Synthesis, with potential applications in augmented\nreality, virtual reality, and assistive technologies. To support this task, we\npresent GazeHOI, the first dataset to capture simultaneous 3D modeling of gaze,\nhand, and object interactions. This task poses significant challenges due to\nthe inherent sparsity and noise in gaze data, as well as the need for high\nconsistency and physical plausibility in generating hand and object motions. To\ntackle these issues, we propose a stacked gaze-guided hand-object interaction\ndiffusion model, named GHO-Diffusion. The stacked design effectively reduces\nthe complexity of motion generation. We also introduce HOI-Manifold Guidance\nduring the sampling stage of GHO-Diffusion, enabling fine-grained control over\ngenerated motions while maintaining the data manifold. Additionally, we propose\na spatial-temporal gaze feature encoding for the diffusion condition and select\ndiffusion results based on consistency scores between gaze-contact maps and\ngaze-interaction trajectories. 
Extensive experiments highlight the\neffectiveness of our method and the unique contributions of our dataset. More\ndetails in https://takiee.github.io/gaze-hoi/.\n","authors":["Jie Tian","Ran Ji","Lingxiao Yang","Suting Ni","Yuexin Ma","Lan Xu","Jingyi Yu","Ye Shi","Jingya Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16169v5.pdf","comment":"Project Page: https://takiee.github.io/gaze-hoi/"},{"id":"http://arxiv.org/abs/2501.03825v1","updated":"2025-01-07T14:37:14Z","published":"2025-01-07T14:37:14Z","title":"Deep Sylvester Posterior Inference for Adaptive Compressed Sensing in\n Ultrasound Imaging","summary":" Ultrasound images are commonly formed by sequential acquisition of\nbeam-steered scan-lines. Minimizing the number of required scan-lines can\nsignificantly enhance frame rate, field of view, energy efficiency, and data\ntransfer speeds. Existing approaches typically use static subsampling schemes\nin combination with sparsity-based or, more recently, deep-learning-based\nrecovery. In this work, we introduce an adaptive subsampling method that\nmaximizes intrinsic information gain in-situ, employing a Sylvester Normalizing\nFlow encoder to infer an approximate Bayesian posterior under partial\nobservation in real-time. Using the Bayesian posterior and a deep generative\nmodel for future observations, we determine the subsampling scheme that\nmaximizes the mutual information between the subsampled observations, and the\nnext frame of the video. We evaluate our approach using the EchoNet cardiac\nultrasound video dataset and demonstrate that our active sampling method\noutperforms competitive baselines, including uniform and variable-density\nrandom sampling, as well as equidistantly spaced scan-lines, improving mean\nabsolute reconstruction error by 15%. Moreover, posterior inference and the\nsampling scheme generation are performed in just 0.015 seconds (66Hz), making\nit fast enough for real-time 2D ultrasound imaging applications.\n","authors":["Simon W. 
Penninga","Hans van Gorp","Ruud J. G. van Sloun"],"pdf_url":"https://arxiv.org/pdf/2501.03825v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.01460v2","updated":"2025-01-07T14:19:35Z","published":"2024-12-31T10:43:19Z","title":"GDSR: Global-Detail Integration through Dual-Branch Network with Wavelet\n Losses for Remote Sensing Image Super-Resolution","summary":" In recent years, deep neural networks, including Convolutional Neural\nNetworks, Transformers, and State Space Models, have achieved significant\nprogress in Remote Sensing Image (RSI) Super-Resolution (SR). However, existing\nSR methods typically overlook the complementary relationship between global and\nlocal dependencies. These methods either focus on capturing local information\nor prioritize global information, which results in models that are unable to\neffectively capture both global and local features simultaneously. Moreover,\ntheir computational cost becomes prohibitive when applied to large-scale RSIs.\nTo address these challenges, we introduce the novel application of Receptance\nWeighted Key Value (RWKV) to RSI-SR, which captures long-range dependencies\nwith linear complexity. To simultaneously model global and local features, we\npropose the Global-Detail dual-branch structure, GDSR, which performs SR\nreconstruction by paralleling RWKV and convolutional operations to handle\nlarge-scale RSIs. Furthermore, we introduce the Global-Detail Reconstruction\nModule (GDRM) as an intermediary between the two branches to bridge their\ncomplementary roles. In addition, we propose Wavelet Loss, a loss function that\neffectively captures high-frequency detail information in images, thereby\nenhancing the visual quality of SR, particularly in terms of detail\nreconstruction. 
Extensive experiments on several benchmarks, including AID,\nAID_CDM, RSSRD-QH, and RSSRD-QH_CDM, demonstrate that GDSR outperforms the\nstate-of-the-art Transformer-based method HAT by an average of 0.05 dB in PSNR,\nwhile using only 63% of its parameters and 51% of its FLOPs, achieving an\ninference speed 2.9 times faster. Furthermore, the Wavelet Loss shows excellent\ngeneralization across various architectures, providing a novel perspective for\nRSI-SR enhancement.\n","authors":["Qiwei Zhu","Kai Li","Guojing Zhang","Xiaoying Wang","Jianqiang Huang","Xilai Li"],"pdf_url":"https://arxiv.org/pdf/2501.01460v2.pdf","comment":"The experiments were conducted using private datasets that were\n incomplete as they did not include all the necessary copyrights.\n Additionally, the conclusions require further exploration as the work is\n still in progress"},{"id":"http://arxiv.org/abs/2501.03800v1","updated":"2025-01-07T14:06:57Z","published":"2025-01-07T14:06:57Z","title":"MADation: Face Morphing Attack Detection with Foundation Models","summary":" Despite the considerable performance improvements of face recognition\nalgorithms in recent years, the same scientific advances responsible for this\nprogress can also be used to create efficient ways to attack them, posing a\nthreat to their secure deployment. Morphing attack detection (MAD) systems aim\nto detect a specific type of threat, morphing attacks, at an early stage,\npreventing them from being considered for verification in critical processes.\nFoundation models (FM) learn from extensive amounts of unlabeled data,\nachieving remarkable zero-shot generalization to unseen domains. Although this\ngeneralization capacity might be weak when dealing with domain-specific\ndownstream tasks such as MAD, FMs can easily adapt to these settings while\nretaining the built-in knowledge acquired during pre-training. 
In this work, we\nrecognize the potential of FMs to perform well in the MAD task when properly\nadapted to its specificities. To this end, we adapt FM CLIP architectures with\nLoRA weights while simultaneously training a classification head. The\nproposed framework, MADation, surpasses our alternative FM and transformer-based\nframeworks and constitutes the first adaptation of FMs to the MAD task. MADation\npresents competitive results with current MAD solutions in the literature and\neven surpasses them in several evaluation scenarios. To encourage\nreproducibility and facilitate further research in MAD, we publicly release the\nimplementation of MADation at https://github.com/gurayozgur/MADation\n","authors":["Eduarda Caldeira","Guray Ozgur","Tahar Chettaoui","Marija Ivanovska","Fadi Boutros","Vitomir Struc","Naser Damer"],"pdf_url":"https://arxiv.org/pdf/2501.03800v1.pdf","comment":"Accepted at WACV 2025 workshops"},{"id":"http://arxiv.org/abs/2501.03786v1","updated":"2025-01-07T13:51:41Z","published":"2025-01-07T13:51:41Z","title":"KAnoCLIP: Zero-Shot Anomaly Detection through Knowledge-Driven Prompt\n Learning and Enhanced Cross-Modal Integration","summary":" Zero-shot anomaly detection (ZSAD) identifies anomalies without needing\ntraining samples from the target dataset, essential for scenarios with privacy\nconcerns or limited data. Vision-language models like CLIP show potential in\nZSAD but have limitations: relying on manually crafted fixed textual\ndescriptions or anomaly prompts is time-consuming and prone to semantic\nambiguity, and CLIP struggles with pixel-level anomaly segmentation, focusing\nmore on global semantics than local details. To address these limitations, we\nintroduce KAnoCLIP, a novel ZSAD framework that leverages vision-language\nmodels. 
KAnoCLIP combines general knowledge from a Large Language Model\n(GPT-3.5) and fine-grained, image-specific knowledge from a Visual Question\nAnswering system (Llama3) via Knowledge-Driven Prompt Learning (KnPL). KnPL\nuses a knowledge-driven (KD) loss function to create learnable anomaly prompts,\nremoving the need for fixed text prompts and enhancing generalization. KAnoCLIP\nincludes the CLIP visual encoder with V-V attention (CLIP-VV), Bi-Directional\nCross-Attention for Multi-Level Cross-Modal Interaction (Bi-CMCI), and\nConv-Adapter. These components preserve local visual semantics, improve local\ncross-modal fusion, and align global visual features with textual information,\nenhancing pixel-level anomaly detection. KAnoCLIP achieves state-of-the-art\nperformance in ZSAD across 12 industrial and medical datasets, demonstrating\nsuperior generalization compared to existing methods.\n","authors":["Chengyuan Li","Suyang Zhou","Jieping Kong","Lei Qi","Hui Xue"],"pdf_url":"https://arxiv.org/pdf/2501.03786v1.pdf","comment":"Accepted by ICASSP 2025"},{"id":"http://arxiv.org/abs/2410.23091v6","updated":"2025-01-07T13:44:57Z","published":"2024-10-30T15:06:44Z","title":"CausalDiff: Causality-Inspired Disentanglement via Diffusion Model for\n Adversarial Defense","summary":" Despite ongoing efforts to defend neural classifiers from adversarial\nattacks, they remain vulnerable, especially to unseen attacks. In contrast,\nhumans are difficult to be cheated by subtle manipulations, since we make\njudgments only based on essential factors. Inspired by this observation, we\nattempt to model label generation with essential label-causative factors and\nincorporate label-non-causative factors to assist data generation. 
For an\nadversarial example, we aim to discriminate the perturbations as non-causative\nfactors and make predictions only based on the label-causative factors.\nConcretely, we propose a causal diffusion model (CausalDiff) that adapts\ndiffusion models for conditional data generation and disentangles the two types\nof causal factors by learning towards a novel causal information bottleneck\nobjective. Empirically, CausalDiff has significantly outperformed\nstate-of-the-art defense methods on various unseen attacks, achieving an\naverage robustness of 86.39% (+4.01%) on CIFAR-10, 56.25% (+3.13%) on\nCIFAR-100, and 82.62% (+4.93%) on GTSRB (German Traffic Sign Recognition\nBenchmark). The code is available at\nhttps://github.com/CAS-AISafetyBasicResearchGroup/CausalDiff.\n","authors":["Mingkun Zhang","Keping Bi","Wei Chen","Quanrun Chen","Jiafeng Guo","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2410.23091v6.pdf","comment":"accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.04280v3","updated":"2025-01-07T13:43:36Z","published":"2024-06-06T17:26:40Z","title":"xMIL: Insightful Explanations for Multiple Instance Learning in\n Histopathology","summary":" Multiple instance learning (MIL) is an effective and widely used approach for\nweakly supervised machine learning. In histopathology, MIL models have achieved\nremarkable success in tasks like tumor detection, biomarker prediction, and\noutcome prognostication. However, MIL explanation methods are still lagging\nbehind, as they are limited to small bag sizes or disregard instance\ninteractions. We revisit MIL through the lens of explainable AI (XAI) and\nintroduce xMIL, a refined framework with more general assumptions. We\ndemonstrate how to obtain improved MIL explanations using layer-wise relevance\npropagation (LRP) and conduct extensive evaluation experiments on three toy\nsettings and four real-world histopathology datasets. 
Our approach consistently\noutperforms previous explanation attempts with particularly improved\nfaithfulness scores on challenging biomarker prediction tasks. Finally, we\nshowcase how xMIL explanations enable pathologists to extract insights from MIL\nmodels, representing a significant advance for knowledge discovery and model\ndebugging in digital histopathology. Codes are available at:\nhttps://github.com/bifold-pathomics/xMIL.\n","authors":["Julius Hense","Mina Jamshidi Idaji","Oliver Eberle","Thomas Schnake","Jonas Dippel","Laure Ciernik","Oliver Buchstab","Andreas Mock","Frederick Klauschen","Klaus-Robert Müller"],"pdf_url":"https://arxiv.org/pdf/2406.04280v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.02285v2","updated":"2025-01-07T13:38:34Z","published":"2025-01-04T13:27:18Z","title":"Hyperbolic Contrastive Learning for Hierarchical 3D Point Cloud\n Embedding","summary":" Hyperbolic spaces allow for more efficient modeling of complex, hierarchical\nstructures, which is particularly beneficial in tasks involving multi-modal\ndata. Although hyperbolic geometries have been proven effective for\nlanguage-image pre-training, their capabilities to unify language, image, and\n3D Point Cloud modalities are under-explored. We extend the 3D Point Cloud\nmodality in hyperbolic multi-modal contrastive pre-training. Additionally, we\nexplore the entailment, modality gap, and alignment regularizers for learning\nhierarchical 3D embeddings and facilitating the transfer of knowledge from both\nText and Image modalities. These regularizers enable the learning of\nintra-modal hierarchy within each modality and inter-modal hierarchy across\ntext, 2D images, and 3D Point Clouds. 
Experimental results demonstrate that our\nproposed training strategy yields an outstanding 3D Point Cloud encoder, and\nthe obtained 3D Point Cloud hierarchical embeddings significantly improve\nperformance on various downstream tasks.\n","authors":["Yingjie Liu","Pengyu Zhang","Ziyao He","Mingsong Chen","Xuan Tang","Xian Wei"],"pdf_url":"https://arxiv.org/pdf/2501.02285v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03775v1","updated":"2025-01-07T13:30:54Z","published":"2025-01-07T13:30:54Z","title":"Strip R-CNN: Large Strip Convolution for Remote Sensing Object Detection","summary":" Despite rapid development, remote sensing object detection\nremains challenging for high aspect ratio objects. This paper shows\nthat large strip convolutions are good feature representation learners for\nremote sensing object detection and can detect objects of various aspect ratios\nwell. Based on large strip convolutions, we build a new network architecture\ncalled Strip R-CNN, which is simple, efficient, and powerful. Unlike recent\nremote sensing object detectors that leverage large-kernel convolutions with\nsquare shapes, our Strip R-CNN takes advantage of sequential orthogonal large\nstrip convolutions to capture spatial information. In addition, we enhance the\nlocalization capability of remote-sensing object detectors by decoupling the\ndetection heads and equipping the localization head with strip convolutions to\nbetter localize the target objects. Extensive experiments on several\nbenchmarks, e.g., DOTA, FAIR1M, HRSC2016, and DIOR, show that our Strip R-CNN\ncan largely improve upon previous works. 
Notably, our 30M model achieves 82.75% mAP\non DOTA-v1.0, setting a new state-of-the-art record. Code is available at\nhttps://github.com/YXB-NKU/Strip-R-CNN.\n","authors":["Xinbin Yuan","ZhaoHui Zheng","Yuxuan Li","Xialei Liu","Li Liu","Xiang Li","Qibin Hou","Ming-Ming Cheng"],"pdf_url":"https://arxiv.org/pdf/2501.03775v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03767v1","updated":"2025-01-07T13:14:25Z","published":"2025-01-07T13:14:25Z","title":"AutoFish: Dataset and Benchmark for Fine-grained Analysis of Fish","summary":" Automated fish documentation processes are expected in the near future to\nplay an essential role in sustainable fisheries management and in addressing\nthe challenges of overfishing. In this paper, we present a novel and publicly\navailable dataset named AutoFish designed for fine-grained fish analysis. The\ndataset comprises 1,500 images of 454 specimens of visually similar fish placed\nin various constellations on a white conveyor belt and annotated with instance\nsegmentation masks, IDs, and length measurements. The data was collected in a\ncontrolled environment using an RGB camera. The annotation procedure involved\nmanual point annotations, initial segmentation masks proposed by the Segment\nAnything Model (SAM), and subsequent manual correction of the masks. We\nestablish baseline instance segmentation results using two variations of the\nMask2Former architecture, with the best performing model reaching an mAP of\n89.15%. Additionally, we present two baseline length estimation methods, the\nbest performing being a custom MobileNetV2-based regression model reaching an\nMAE of 0.62cm in images with no occlusion and 1.38cm in images with occlusion.\nLink to project page: https://vap.aau.dk/autofish/.\n","authors":["Stefan Hein Bengtson","Daniel Lehotský","Vasiliki Ismiroglou","Niels Madsen","Thomas B. 
Moeslund","Malte Pedersen"],"pdf_url":"https://arxiv.org/pdf/2501.03767v1.pdf","comment":"In the 3rd Workshop on Maritime Computer Vision (MaCVi) at WACV'25"},{"id":"http://arxiv.org/abs/2501.02867v2","updated":"2025-01-07T13:13:17Z","published":"2025-01-06T09:19:23Z","title":"Diff-Lung: Diffusion-Based Texture Synthesis for Enhanced Pathological\n Tissue Segmentation in Lung CT Scans","summary":" Accurate quantification of the extent of lung pathological patterns\n(fibrosis, ground-glass opacity, emphysema, consolidation) is a prerequisite\nfor diagnosis and follow-up of interstitial lung diseases. However, segmentation is\nchallenging due to the significant class imbalance between healthy and\npathological tissues. This paper addresses this issue by leveraging a diffusion\nmodel for data augmentation applied during the training of an AI model. Our approach\ngenerates synthetic pathological tissue patches while preserving essential\nshape characteristics and intricate details specific to each tissue type. This\nmethod enhances the segmentation process by increasing the occurrence of\nunderrepresented classes in the training data. We demonstrate that our\ndiffusion-based augmentation technique improves segmentation accuracy across\nall pathological tissue types, particularly for the less common patterns. This\nadvancement contributes to more reliable automated analysis of lung CT scans,\npotentially improving clinical decision-making and patient outcomes.\n","authors":["Rezkellah Noureddine Khiati","Pierre-Yves Brillet","Radu Ispas","Catalin Fetita"],"pdf_url":"https://arxiv.org/pdf/2501.02867v2.pdf","comment":"accepted at ISBI 2025"},{"id":"http://arxiv.org/abs/2501.03765v1","updated":"2025-01-07T13:09:44Z","published":"2025-01-07T13:09:44Z","title":"Image Segmentation: Inducing graph-based learning","summary":" This study explores the potential of graph neural networks (GNNs) to enhance\nsemantic segmentation across diverse image modalities. 
We evaluate the\neffectiveness of a novel GNN-based U-Net architecture on three distinct\ndatasets: PascalVOC, a standard benchmark for natural image segmentation,\nWoodScape, a challenging dataset of fisheye images commonly used in autonomous\ndriving, introducing significant geometric distortions; and ISIC2016, a dataset\nof dermoscopic images for skin lesion segmentation. We compare our proposed\nUNet-GNN model against established convolutional neural networks (CNNs) based\nsegmentation models, including U-Net and U-Net++, as well as the\ntransformer-based SwinUNet. Unlike these methods, which primarily rely on local\nconvolutional operations or global self-attention, GNNs explicitly model\nrelationships between image regions by constructing and operating on a graph\nrepresentation of the image features. This approach allows the model to capture\nlong-range dependencies and complex spatial relationships, which we hypothesize\nwill be particularly beneficial for handling geometric distortions present in\nfisheye imagery and capturing intricate boundaries in medical images. 
Our\nanalysis demonstrates the versatility of GNNs in addressing diverse\nsegmentation challenges and highlights their potential to improve segmentation\naccuracy in various applications, including autonomous driving and medical\nimage analysis.\n","authors":["Aryan Singh","Pepijn Van de Ven","Ciarán Eising","Patrick Denny"],"pdf_url":"https://arxiv.org/pdf/2501.03765v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.09453v2","updated":"2025-01-07T13:00:57Z","published":"2024-10-12T09:16:09Z","title":"MMAD: The First-Ever Comprehensive Benchmark for Multimodal Large\n Language Models in Industrial Anomaly Detection","summary":" In the field of industrial inspection, Multimodal Large Language Models\n(MLLMs) have a high potential to renew the paradigms in practical applications\ndue to their robust language capabilities and generalization abilities.\nHowever, despite their impressive problem-solving skills in many domains,\nMLLMs' ability in industrial anomaly detection has not been systematically\nstudied. To bridge this gap, we present MMAD, the first-ever full-spectrum\nMLLMs benchmark in industrial Anomaly Detection. We defined seven key subtasks\nof MLLMs in industrial inspection and designed a novel pipeline to generate the\nMMAD dataset with 39,672 questions for 8,366 industrial images. With MMAD, we\nhave conducted a comprehensive, quantitative evaluation of various\nstate-of-the-art MLLMs. The commercial models performed the best, with the\naverage accuracy of GPT-4o models reaching 74.9%. However, this result falls\nfar short of industrial requirements. Our analysis reveals that current MLLMs\nstill have significant room for improvement in answering questions related to\nindustrial anomalies and defects. 
We further explore two training-free\nperformance enhancement strategies to help models improve in industrial\nscenarios, highlighting their promising potential for future research.\n","authors":["Xi Jiang","Jian Li","Hanqiu Deng","Yong Liu","Bin-Bin Gao","Yifeng Zhou","Jialin Li","Chengjie Wang","Feng Zheng"],"pdf_url":"https://arxiv.org/pdf/2410.09453v2.pdf","comment":"The code and data are available at https://github.com/jam-cc/MMAD"},{"id":"http://arxiv.org/abs/2409.18301v3","updated":"2025-01-07T12:44:48Z","published":"2024-09-26T21:16:51Z","title":"Wavelet-Driven Generalizable Framework for Deepfake Face Forgery\n Detection","summary":" The evolution of digital image manipulation, particularly with the\nadvancement of deep generative models, significantly challenges existing\ndeepfake detection methods, especially when the origin of the deepfake is\nobscure. To tackle the increasing complexity of these forgeries, we propose\n\\textbf{Wavelet-CLIP}, a deepfake detection framework that integrates wavelet\ntransforms with features derived from the ViT-L/14 architecture, pre-trained in\nthe CLIP fashion. Wavelet-CLIP utilizes Wavelet Transforms to deeply analyze\nboth spatial and frequency features from images, thus enhancing the model's\ncapability to detect sophisticated deepfakes. To verify the effectiveness of\nour approach, we conducted extensive evaluations against existing\nstate-of-the-art methods for cross-dataset generalization and detection of\nunseen images generated by standard diffusion models. Our method showcases\noutstanding performance, achieving an average AUC of 0.749 for cross-data\ngeneralization and 0.893 for robustness against unseen deepfakes, outperforming\nall compared methods. 
The code can be reproduced from the repo:\n\\url{https://github.com/lalithbharadwajbaru/Wavelet-CLIP}\n","authors":["Lalith Bharadwaj Baru","Rohit Boddeda","Shilhora Akshay Patel","Sai Mohan Gajapaka"],"pdf_url":"https://arxiv.org/pdf/2409.18301v3.pdf","comment":"9 Pages, 2 Figures, 3 Tables"},{"id":"http://arxiv.org/abs/2501.03737v1","updated":"2025-01-07T12:29:32Z","published":"2025-01-07T12:29:32Z","title":"Re-Visible Dual-Domain Self-Supervised Deep Unfolding Network for MRI\n Reconstruction","summary":" Magnetic Resonance Imaging (MRI) is widely used in clinical practice, but\nsuffers from prolonged acquisition time. Although deep learning methods have\nbeen proposed to accelerate acquisition and demonstrate promising performance,\nthey rely on high-quality fully-sampled datasets for training in a supervised\nmanner. However, such datasets are time-consuming and expensive to collect,\nwhich constrains their broader applications. On the other hand, self-supervised\nmethods offer an alternative by enabling learning from under-sampled data\nalone, but most existing methods rely on further partitioned under-sampled\nk-space data as the model's input for training, resulting in a loss of valuable\ninformation. Additionally, their models have not fully incorporated image\npriors, leading to degraded reconstruction performance. In this paper, we\npropose a novel re-visible dual-domain self-supervised deep unfolding network\nto address these issues when only under-sampled datasets are available.\nSpecifically, by incorporating re-visible dual-domain loss, all under-sampled\nk-space data are utilized during training to mitigate information loss caused\nby further partitioning. This design enables the model to implicitly adapt to\nall under-sampled k-space data as input. 
Additionally, we design a deep\nunfolding network based on the Chambolle and Pock Proximal Point Algorithm\n(DUN-CP-PPA) to achieve end-to-end reconstruction, incorporating imaging\nphysics and image priors to guide the reconstruction process. By employing a\nSpatial-Frequency Feature Extraction (SFFE) block to capture global and local\nfeature representations, we enhance the model's ability to learn\ncomprehensive image priors efficiently. Experiments conducted on the fastMRI and IXI\ndatasets demonstrate that our method significantly outperforms state-of-the-art\napproaches in terms of reconstruction performance.\n","authors":["Hao Zhang","Qi Wang","Jian Sun","Zhijie Wen","Jun Shi","Shihui Ying"],"pdf_url":"https://arxiv.org/pdf/2501.03737v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03729v1","updated":"2025-01-07T12:17:25Z","published":"2025-01-07T12:17:25Z","title":"Realistic Test-Time Adaptation of Vision-Language Models","summary":" The zero-shot capabilities of Vision-Language Models (VLMs) have been widely\nleveraged to improve predictive performance. However, previous works on\ntransductive or test-time adaptation (TTA) often make strong assumptions about\nthe data distribution, such as the presence of all classes. Our work challenges\nthese favorable deployment scenarios, and introduces a more realistic\nevaluation framework, including: (i) a variable number of effective classes for\nadaptation within a single batch, and (ii) non-i.i.d. batches of test samples\nin online adaptation settings. We provide comprehensive evaluations,\ncomparisons, and ablation studies that demonstrate how current transductive or\nTTA methods for VLMs systematically compromise the models' initial zero-shot\nrobustness across various realistic scenarios, favoring performance gains under\nadvantageous assumptions about the test samples' distributions. 
Furthermore, we\nintroduce StatA, a versatile method that could handle a wide range of\ndeployment scenarios, including those with a variable number of effective\nclasses at test time. Our approach incorporates a novel regularization term\ndesigned specifically for VLMs, which acts as a statistical anchor preserving\nthe initial text-encoder knowledge, particularly in low-data regimes. Code\navailable at https://github.com/MaxZanella/StatA.\n","authors":["Maxime Zanella","Clément Fuchs","Christophe De Vleeschouwer","Ismail Ben Ayed"],"pdf_url":"https://arxiv.org/pdf/2501.03729v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03722v1","updated":"2025-01-07T12:03:02Z","published":"2025-01-07T12:03:02Z","title":"Self-adaptive vision-language model for 3D segmentation of pulmonary\n artery and vein","summary":" Accurate segmentation of pulmonary structures is crucial in clinical\ndiagnosis, disease study, and treatment planning. Significant progress has been\nmade in deep learning-based segmentation techniques, but most require large amounts of\nlabeled data for training. Consequently, developing precise segmentation\nmethods that demand fewer labeled datasets is paramount in medical image\nanalysis. The emergence of pre-trained vision-language foundation models, such\nas CLIP, recently opened the door for universal computer vision tasks.\nExploiting the generalization ability of these pre-trained foundation models on\ndownstream tasks, such as segmentation, leads to unexpectedly strong performance with a\nrelatively small amount of labeled data. However, exploring these models for\npulmonary artery-vein segmentation is still limited. This paper proposes a\nnovel framework called Language-guided self-adaptive Cross-Attention Fusion\nFramework. Our method adopts pre-trained CLIP as a strong feature extractor for\ngenerating the segmentation of 3D CT scans, while adaptively aggregating the\ncross-modality of text and image representations. 
We propose a specially\ndesigned adapter module to fine-tune pre-trained CLIP with a self-adaptive\nlearning strategy to effectively fuse the two modalities of embeddings. We\nextensively validate our method on a local dataset, which is the largest\npulmonary artery-vein CT dataset to date and consists of 718 labeled scans in\ntotal. The experiments show that our method outperformed other state-of-the-art\nmethods by a large margin. Our data and code will be made publicly available\nupon acceptance.\n","authors":["Xiaotong Guo","Deqian Yang","Dan Wang","Haochen Zhao","Yuan Li","Zhilin Sui","Tao Zhou","Lijun Zhang","Yanda Meng"],"pdf_url":"https://arxiv.org/pdf/2501.03722v1.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2408.16469v2","updated":"2025-01-07T12:02:22Z","published":"2024-08-29T12:00:11Z","title":"Multi-source Domain Adaptation for Panoramic Semantic Segmentation","summary":" Unsupervised domain adaptation methods for panoramic semantic segmentation\nutilize real pinhole images or low-cost synthetic panoramic images to transfer\nsegmentation models to real panoramic images. However, these methods struggle\nto understand the panoramic structure using only real pinhole images and lack\nreal-world scene perception with only synthetic panoramic images. Therefore, in\nthis paper, we propose a new task, Multi-source Domain Adaptation for Panoramic\nSemantic Segmentation (MSDA4PASS), which leverages both real pinhole and\nsynthetic panoramic images to improve segmentation on unlabeled real panoramic\nimages. There are two key issues in the MSDA4PASS task: (1) distortion gaps\nbetween the pinhole and panoramic domains -- panoramic images exhibit global\nand local distortions absent in pinhole images; (2) texture gaps between the\nsource and target domains -- scenes and styles differ across domains. 
To\naddress these two issues, we propose a novel framework, Deformation Transform\nAligner for Panoramic Semantic Segmentation (DTA4PASS), which converts all\npinhole images in the source domains into distorted images and aligns the\nsource distorted and panoramic images with the target panoramic images.\nSpecifically, DTA4PASS consists of two main components: Unpaired Semantic\nMorphing (USM) and Distortion Gating Alignment (DGA). First, in USM, the\nDual-view Discriminator (DvD) assists in training the diffeomorphic deformation\nnetwork at the image and pixel level, enabling the effective deformation\ntransformation of pinhole images without paired panoramic views, alleviating\ndistortion gaps. Second, DGA assigns pinhole-like (pin-like) and panoramic-like\n(pan-like) features to each image by gating, and aligns these two features\nthrough uncertainty estimation, reducing texture gaps.\n","authors":["Jing Jiang","Sicheng Zhao","Jiankun Zhu","Wenbo Tang","Zhaopan Xu","Jidong Yang","Guoping Liu","Tengfei Xing","Pengfei Xu","Hongxun Yao"],"pdf_url":"https://arxiv.org/pdf/2408.16469v2.pdf","comment":"Accepted by Information Fusion 2025"},{"id":"http://arxiv.org/abs/2501.03717v1","updated":"2025-01-07T11:52:01Z","published":"2025-01-07T11:52:01Z","title":"Materialist: Physically Based Editing Using Single-Image Inverse\n Rendering","summary":" To perform image editing based on single-view, inverse physically based\nrendering, we present a method combining a learning-based approach with\nprogressive differentiable rendering. Given an image, our method leverages\nneural networks to predict initial material properties. Progressive\ndifferentiable rendering is then used to optimize the environment map and\nrefine the material properties with the goal of closely matching the rendered\nresult to the input image. We require only a single image while other inverse\nrendering methods based on the rendering equation require multiple views. 
In\ncomparison to single-view methods that rely on neural renderers, our approach\nachieves more realistic light-material interactions, accurate shadows, and\nglobal illumination. Furthermore, with optimized material properties and\nillumination, our method enables a variety of tasks, including physically based\nmaterial editing, object insertion, and relighting. We also propose a method\nfor material transparency editing that operates effectively without requiring\nfull scene geometry. Compared with methods based on Stable Diffusion, our\napproach offers stronger interpretability and more realistic light refraction\nbased on empirical results.\n","authors":["Lezhong Wang","Duc Minh Tran","Ruiqi Cui","Thomson TG","Manmohan Chandraker","Jeppe Revall Frisvad"],"pdf_url":"https://arxiv.org/pdf/2501.03717v1.pdf","comment":"code will be available at github.com/lez-s/Materialist"},{"id":"http://arxiv.org/abs/2501.03714v1","updated":"2025-01-07T11:43:13Z","published":"2025-01-07T11:43:13Z","title":"MoDec-GS: Global-to-Local Motion Decomposition and Temporal Interval\n Adjustment for Compact Dynamic 3D Gaussian Splatting","summary":" 3D Gaussian Splatting (3DGS) has made significant strides in scene\nrepresentation and neural rendering, with intense efforts focused on adapting\nit for dynamic scenes. Despite delivering remarkable rendering quality and\nspeed, existing methods struggle with storage demands and representing complex\nreal-world motions. To tackle these issues, we propose MoDec-GS, a\nmemory-efficient Gaussian splatting framework designed for reconstructing novel\nviews in challenging scenarios with complex motions. We introduce Global-to-Local\nMotion Decomposition (GLMD) to effectively capture dynamic motions in a\ncoarse-to-fine manner. This approach leverages Global Canonical Scaffolds (Global\nCS) and Local Canonical Scaffolds (Local CS), extending static Scaffold\nrepresentation to dynamic video reconstruction. 
For Global CS, we propose\nGlobal Anchor Deformation (GAD) to efficiently represent global dynamics along\ncomplex motions, by directly deforming the implicit Scaffold attributes which\nare anchor position, offset, and local context features. Next, we finely adjust\nlocal motions via the Local Gaussian Deformation (LGD) of Local CS explicitly.\nAdditionally, we introduce Temporal Interval Adjustment (TIA) to automatically\ncontrol the temporal coverage of each Local CS during training, allowing\nMoDec-GS to find optimal interval assignments based on the specified number of\ntemporal segments. Extensive evaluations demonstrate that MoDec-GS achieves an\naverage 70% reduction in model size over state-of-the-art methods for dynamic 3D\nGaussians from real-world dynamic videos while maintaining or even improving\nrendering quality.\n","authors":["Sangwoon Kwak","Joonsoo Kim","Jun Young Jeong","Won-Sik Cheong","Jihyong Oh","Munchurl Kim"],"pdf_url":"https://arxiv.org/pdf/2501.03714v1.pdf","comment":"The last two authors are co-corresponding authors. Please visit our\n project page at https://kaist-viclab.github.io/MoDecGS-site/"},{"id":"http://arxiv.org/abs/2409.09424v3","updated":"2025-01-07T11:37:57Z","published":"2024-09-14T12:25:14Z","title":"NBBOX: Noisy Bounding Box Improves Remote Sensing Object Detection","summary":" Data augmentation has shown significant advancements in computer vision to\nimprove model performance over the years, particularly in scenarios with\nlimited and insufficient data. Currently, most studies focus on adjusting the\nimage or its features to expand the size, quality, and variety of samples\nduring training in various tasks including object detection. However, we argue\nthat it is necessary to investigate bounding box transformations as a data\naugmentation technique rather than image-level transformations, especially in\naerial imagery due to potentially inconsistent bounding box annotations. 
Hence,\nthis letter presents a thorough investigation of bounding box transformation in\nterms of scaling, rotation, and translation for remote sensing object\ndetection. We call this augmentation strategy NBBOX (Noise Injection into\nBounding Box). We conduct extensive experiments on DOTA and DIOR-R, both\nwell-known datasets that include a variety of rotated generic objects in aerial\nimages. Experimental results show that our approach significantly improves\nremote sensing object detection without bells and whistles, and it is more\ntime-efficient than other state-of-the-art augmentation strategies.\n","authors":["Yechan Kim","SooYeon Kim","Moongu Jeon"],"pdf_url":"https://arxiv.org/pdf/2409.09424v3.pdf","comment":"Accepted to IEEE Geoscience and Remote Sensing Letters"},{"id":"http://arxiv.org/abs/2411.11543v3","updated":"2025-01-07T11:09:52Z","published":"2024-11-18T13:01:57Z","title":"PSA-VLM: Enhancing Vision-Language Model Safety through Progressive\n Concept-Bottleneck-Driven Alignment","summary":" Benefiting from the powerful capabilities of Large Language Models (LLMs),\npre-trained visual encoder models connected to LLMs form Vision Language Models\n(VLMs). However, recent research shows that the visual modality in VLMs is\nhighly vulnerable, allowing attackers to bypass safety alignment in LLMs\nthrough visually transmitted content, launching harmful attacks. To address\nthis challenge, we propose a progressive concept-based alignment strategy,\nPSA-VLM, which incorporates safety modules as concept bottlenecks to enhance\nvisual modality safety alignment. By aligning model predictions with specific\nsafety concepts, we improve defenses against risky images, enhancing\nexplainability and controllability while minimally impacting general\nperformance. Our method is obtained through two-stage training. 
The low\ncomputational cost of the first stage brings a substantial performance\nimprovement, and the fine-tuning of the language model in the second stage\nfurther improves the safety performance. Our method achieves state-of-the-art\nresults on popular VLM safety benchmarks.\n","authors":["Zhendong Liu","Yuanbi Nie","Yingshui Tan","Jiaheng Liu","Xiangyu Yue","Qiushi Cui","Chongjun Wang","Xiaoyong Zhu","Bo Zheng"],"pdf_url":"https://arxiv.org/pdf/2411.11543v3.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2405.13581"},{"id":"http://arxiv.org/abs/2501.03700v1","updated":"2025-01-07T11:07:32Z","published":"2025-01-07T11:07:32Z","title":"AuxDepthNet: Real-Time Monocular 3D Object Detection with\n Depth-Sensitive Features","summary":" Monocular 3D object detection is a challenging task in autonomous systems due\nto the lack of explicit depth information in single-view images. Existing\nmethods often depend on external depth estimators or expensive sensors, which\nincrease computational complexity and hinder real-time performance. To overcome\nthese limitations, we propose AuxDepthNet, an efficient framework for real-time\nmonocular 3D object detection that eliminates the reliance on external depth\nmaps or pre-trained depth models. AuxDepthNet introduces two key components:\nthe Auxiliary Depth Feature (ADF) module, which implicitly learns\ndepth-sensitive features to improve spatial reasoning and computational\nefficiency, and the Depth Position Mapping (DPM) module, which embeds depth\npositional information directly into the detection process to enable accurate\nobject localization and 3D bounding box regression. Leveraging the DepthFusion\nTransformer architecture, AuxDepthNet globally integrates visual and\ndepth-sensitive features through depth-guided interactions, ensuring robust and\nefficient detection. 
Extensive experiments on the KITTI dataset show that\nAuxDepthNet achieves state-of-the-art performance, with $\\text{AP}_{3D}$ scores\nof 24.72\\% (Easy), 18.63\\% (Moderate), and 15.31\\% (Hard), and\n$\\text{AP}_{\\text{BEV}}$ scores of 34.11\\% (Easy), 25.18\\% (Moderate), and\n21.90\\% (Hard) at an IoU threshold of 0.7.\n","authors":["Ruochen Zhang","Hyeung-Sik Choi","Dongwook Jung","Phan Huy Nam Anh","Sang-Ki Jeong","Zihao Zhu"],"pdf_url":"https://arxiv.org/pdf/2501.03700v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15353v3","updated":"2025-01-07T11:06:13Z","published":"2024-03-22T17:08:03Z","title":"Fully automated workflow for designing patient-specific orthopaedic\n implants: application to total knee arthroplasty","summary":" Background. Osteoarthritis affects about 528 million people worldwide,\ncausing pain and stiffness in the joints. Arthroplasty is commonly performed to\ntreat joint osteoarthritis, reducing pain and improving mobility. Nevertheless,\na significant share of patients remain unsatisfied with their surgery.\nPersonalised arthroplasty was introduced to improve surgical outcomes; however,\ncurrent solutions require delays, making them difficult to integrate into clinical\nroutine. We propose a fully automated workflow to design patient-specific\nimplants for total knee arthroplasty.\n Methods. The proposed pipeline first uses artificial neural networks to\nsegment the femur and tibia proximal and distal extremities. Then the full\nbones are reconstructed using augmented statistical shape models, combining\nshape and landmarks information. Finally, 77 morphological parameters are\ncomputed to design patient-specific implants. The developed workflow has been\ntrained on 91 CT scans and evaluated on 41 CT scans, in terms of accuracy and\nexecution time.\n Results. The workflow accuracy was $0.4\\pm0.2mm$ for segmentation,\n$1.0\\pm0.3mm$ for full bone reconstruction, and $2.2\\pm1.5mm$ for anatomical\nlandmarks determination. 
The custom implants fitted the patients' anatomy with\n$0.9\\pm0.5mm$ accuracy. The whole process from segmentation to implants' design\nlasted about 15 minutes.\n Conclusion. The proposed workflow performs a fast and reliable\npersonalisation of knee implants, directly from a CT image without requiring\nany manual intervention. It allows the establishment of patient-specific\npre-operative planning in a very short time, making it easily available for all\npatients. Combined with efficient implant manufacturing techniques, this\nsolution could help meet the growing demand for arthroplasties while reducing\ncomplications and improving patients' satisfaction.\n","authors":["Aziliz Guezou-Philippe","Arnaud Clavé","Ehouarn Maguet","Ludivine Maintier","Charles Garraud","Jean-Rassaire Fouefack","Valérie Burdin","Eric Stindel","Guillaume Dardenne"],"pdf_url":"https://arxiv.org/pdf/2403.15353v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03699v1","updated":"2025-01-07T11:03:43Z","published":"2025-01-07T11:03:43Z","title":"Motion-Aware Generative Frame Interpolation","summary":" Generative frame interpolation, empowered by large-scale pre-trained video\ngeneration models, has demonstrated remarkable advantages in complex scenes.\nHowever, existing methods heavily rely on the generative model to independently\ninfer the correspondences between input frames, an ability that is inadequately\ndeveloped during pre-training. In this work, we propose a novel framework,\ntermed Motion-aware Generative frame interpolation (MoG), to significantly\nenhance the model's motion awareness by integrating explicit motion guidance.\nSpecifically, we investigate two key questions: what can serve as effective\nmotion guidance, and how we can seamlessly embed this guidance into the\ngenerative model. For the first question, we reveal that the intermediate flow\nfrom flow-based interpolation models could efficiently provide task-oriented\nmotion guidance. 
Regarding the second, we first obtain guidance-based\nrepresentations of intermediate frames by warping input frames' representations\nusing guidance, and then integrate them into the model at both latent and\nfeature levels. To demonstrate the versatility of our method, we train MoG on\nboth real-world and animation datasets. Comprehensive evaluations show that our\nMoG significantly outperforms the existing methods in both domains, achieving\nsuperior video quality and improved fidelity.\n","authors":["Guozhen Zhang","Yuhan Zhu","Yutao Cui","Xiaotong Zhao","Kai Ma","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2501.03699v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.15286v3","updated":"2025-01-07T10:34:12Z","published":"2024-05-24T07:18:09Z","title":"3D Annotation-Free Learning by Distilling 2D Open-Vocabulary\n Segmentation Models for Autonomous Driving","summary":" Point cloud data labeling is considered a time-consuming and expensive task\nin autonomous driving, whereas annotation-free training can avoid it\nby learning point cloud representations from unannotated data. In this paper,\nwe propose AFOV, a novel 3D \\textbf{A}nnotation-\\textbf{F}ree framework\nassisted by 2D \\textbf{O}pen-\\textbf{V}ocabulary segmentation models. It\nconsists of two stages: In the first stage, we innovatively integrate\nhigh-quality textual and image features of 2D open-vocabulary models and\npropose the Tri-Modal contrastive Pre-training (TMP). In the second stage,\nspatial mapping between point clouds and images is utilized to generate\npseudo-labels, enabling cross-modal knowledge distillation. In addition, we\nintroduce the Approximate Flat Interaction (AFI) to address the noise during\nalignment and label confusion. To validate the superiority of AFOV, extensive\nexperiments are conducted on multiple related datasets. 
We achieved a\nrecord-breaking 47.73\\% mIoU on the annotation-free 3D segmentation task in\nnuScenes, surpassing the previous best model by 3.13\\% mIoU. Meanwhile, the\nperformance of fine-tuning with 1\\% data on nuScenes and SemanticKITTI reached\na remarkable 51.75\\% mIoU and 48.14\\% mIoU, outperforming all previous\npre-trained models.\n","authors":["Boyi Sun","Yuhang Liu","Xingxia Wang","Bin Tian","Long Chen","Fei-Yue Wang"],"pdf_url":"https://arxiv.org/pdf/2405.15286v3.pdf","comment":"15 pages, 7 figures, codes are available at\n https://github.com/sbysbysbys/AFOV"},{"id":"http://arxiv.org/abs/2501.03675v1","updated":"2025-01-07T10:21:21Z","published":"2025-01-07T10:21:21Z","title":"SMIR: Efficient Synthetic Data Pipeline To Improve Multi-Image Reasoning","summary":" Vision-Language Models (VLMs) have shown strong performance in understanding\nsingle images, aided by numerous high-quality instruction datasets. However,\nmulti-image reasoning tasks are still under-explored in the open-source\ncommunity due to two main challenges: (1) scaling datasets with multiple\ncorrelated images and complex reasoning instructions is resource-intensive and\nmaintaining quality is difficult, and (2) there is a lack of robust evaluation\nbenchmarks for multi-image tasks. To address these issues, we introduce SMIR,\nan efficient synthetic data-generation pipeline for multi-image reasoning, and\na high-quality dataset generated using this pipeline. Our pipeline efficiently\nextracts highly correlated images using multimodal embeddings, combining visual\nand descriptive information, and leverages open-source LLMs to generate quality\ninstructions. Using this pipeline, we generated 160K synthetic training\nsamples, offering a cost-effective alternative to expensive closed-source\nsolutions. Additionally, we present SMIR-BENCH, a novel multi-image reasoning\nevaluation benchmark comprising 200 diverse examples across 7 complex\nmulti-image reasoning tasks. 
SMIR-BENCH is multi-turn and utilizes a VLM judge\nto evaluate free-form responses, providing a comprehensive assessment of model\nexpressiveness and reasoning capability across modalities. We demonstrate the\neffectiveness of the SMIR dataset by fine-tuning several open-source VLMs and\nevaluating their performance on SMIR-BENCH. Our results show that models\ntrained on our dataset outperform baseline models in multi-image reasoning\ntasks by up to 8% with a much more scalable data pipeline.\n","authors":["Andrew Li","Rahul Thapa","Rahul Chalamala","Qingyang Wu","Kezhen Chen","James Zou"],"pdf_url":"https://arxiv.org/pdf/2501.03675v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03674v1","updated":"2025-01-07T10:20:16Z","published":"2025-01-07T10:20:16Z","title":"Action Quality Assessment via Hierarchical Pose-guided Multi-stage\n Contrastive Regression","summary":" Action Quality Assessment (AQA), which aims at automatic and fair evaluation\nof athletic performance, has gained increasing attention in recent years.\nHowever, athletes are often in rapid movement and the corresponding visual\nappearance variances are subtle, making it challenging to capture fine-grained\npose differences and leading to poor estimation performance. Furthermore, most\ncommon AQA tasks, such as diving in sports, are usually divided into multiple\nsub-actions, each of which has a different duration. However, existing\nmethods focus on segmenting the video into fixed frames, which disrupts the\ntemporal continuity of sub-actions, resulting in unavoidable prediction errors.\nTo address these challenges, we propose a novel action quality assessment\nmethod through hierarchically pose-guided multi-stage contrastive regression.\nFirstly, we introduce a multi-scale dynamic visual-skeleton encoder to capture\nfine-grained spatio-temporal visual and skeletal features. Then, a procedure\nsegmentation network is introduced to separate different sub-actions and obtain\nsegmented features. 
Afterwards, the segmented visual and skeletal features are\nboth fed into a multi-modal fusion module as physics structural priors, to\nguide the model in learning refined activity similarities and variances.\nFinally, a multi-stage contrastive learning regression approach is employed to\nlearn discriminative representations and output prediction results. In\naddition, we introduce a newly-annotated FineDiving-Pose Dataset to improve the\ncurrent low-quality human pose labels. In experiments, the results on\nFineDiving and MTL-AQA datasets demonstrate the effectiveness and superiority\nof our proposed approach. Our source code and dataset are available at\nhttps://github.com/Lumos0507/HP-MCoRe.\n","authors":["Mengshi Qi","Hao Ye","Jiaxuan Peng","Huadong Ma"],"pdf_url":"https://arxiv.org/pdf/2501.03674v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17678v3","updated":"2025-01-07T10:08:19Z","published":"2024-03-26T13:05:49Z","title":"Hierarchical Light Transformer Ensembles for Multimodal Trajectory\n Forecasting","summary":" Accurate trajectory forecasting is crucial for the performance of various\nsystems, such as advanced driver-assistance systems and self-driving vehicles.\nThese forecasts allow us to anticipate events that lead to collisions and,\ntherefore, to mitigate them. Deep Neural Networks have excelled in motion\nforecasting, but overconfidence and weak uncertainty quantification persist.\nDeep Ensembles address these concerns, yet applying them to multimodal\ndistributions remains challenging. In this paper, we propose a novel approach\nnamed Hierarchical Light Transformer Ensembles (HLT-Ens) aimed at efficiently\ntraining an ensemble of Transformer architectures using a novel hierarchical\nloss function. HLT-Ens leverages grouped fully connected layers, inspired by\ngrouped convolution techniques, to capture multimodal distributions\neffectively. 
We demonstrate that HLT-Ens achieves state-of-the-art performance\nlevels through extensive experimentation, offering a promising avenue for\nimproving trajectory forecasting techniques.\n","authors":["Adrien Lafage","Mathieu Barbier","Gianni Franchi","David Filliat"],"pdf_url":"https://arxiv.org/pdf/2403.17678v3.pdf","comment":"WACV 2025"},{"id":"http://arxiv.org/abs/2501.03664v1","updated":"2025-01-07T10:04:01Z","published":"2025-01-07T10:04:01Z","title":"Local Compositional Complexity: How to Detect a Human-readable Message","summary":" Data complexity is an important concept in the natural sciences and related\nareas, but lacks a rigorous and computable definition. In this paper, we focus\non a particular sense of complexity that is high if the data is structured in a\nway that could serve to communicate a message. In this sense, human speech,\nwritten language, drawings, diagrams and photographs are high complexity,\nwhereas data that is close to uniform throughout or populated by random values\nis low complexity. We describe a general framework for measuring data\ncomplexity based on dividing the shortest description of the data into a\nstructured and an unstructured portion, and taking the size of the former as\nthe complexity score. We outline an application of this framework in\nstatistical mechanics that may allow a more objective characterisation of the\nmacrostate and entropy of a physical system. Then, we derive a more precise and\ncomputable definition geared towards human communication, by proposing local\ncompositionality as an appropriate specific structure. 
We demonstrate\nexperimentally that this method can distinguish meaningful signals from noise\nor repetitive signals in auditory, visual and text domains, and could\npotentially help determine whether an extra-terrestrial signal contained a\nmessage.\n","authors":["Louis Mahon"],"pdf_url":"https://arxiv.org/pdf/2501.03664v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.19543v2","updated":"2025-01-07T09:56:36Z","published":"2024-12-27T09:10:30Z","title":"Diverse Rare Sample Generation with Pretrained GANs","summary":" Deep generative models are proficient in generating realistic data but\nstruggle with producing rare samples in low density regions due to their\nscarcity of training datasets and the mode collapse problem. While recent\nmethods aim to improve the fidelity of generated samples, they often reduce\ndiversity and coverage by ignoring rare and novel samples. This study proposes\na novel approach for generating diverse rare samples from high-resolution image\ndatasets with pretrained GANs. Our method employs gradient-based optimization\nof latent vectors within a multi-objective framework and utilizes normalizing\nflows for density estimation on the feature space. This enables the generation\nof diverse rare images, with controllable parameters for rarity, diversity, and\nsimilarity to a reference image. We demonstrate the effectiveness of our\napproach both qualitatively and quantitatively across various datasets and GANs\nwithout retraining or fine-tuning the pretrained GANs.\n","authors":["Subeen Lee","Jiyeon Han","Soyeon Kim","Jaesik Choi"],"pdf_url":"https://arxiv.org/pdf/2412.19543v2.pdf","comment":"Accepted at AAAI 2025"},{"id":"http://arxiv.org/abs/2501.03659v1","updated":"2025-01-07T09:47:46Z","published":"2025-01-07T09:47:46Z","title":"DehazeGS: Seeing Through Fog with 3D Gaussian Splatting","summary":" Current novel view synthesis tasks primarily rely on high-quality and clear\nimages. 
However, in foggy scenes, scattering and attenuation can significantly\ndegrade the reconstruction and rendering quality. Although NeRF-based dehazing\nreconstruction algorithms have been developed, their use of deep fully\nconnected neural networks and per-ray sampling strategies leads to high\ncomputational costs. Moreover, NeRF's implicit representation struggles to\nrecover fine details from hazy scenes. In contrast, recent advancements in 3D\nGaussian Splatting achieve high-quality 3D scene reconstruction by explicitly\nmodeling point clouds into 3D Gaussians. In this paper, we propose leveraging\nthe explicit Gaussian representation to explain the foggy image formation\nprocess through a physically accurate forward rendering process. We introduce\nDehazeGS, a method capable of decomposing and rendering a fog-free background\nfrom participating media using only multi-view foggy images as input. We model\nthe transmission within each Gaussian distribution to simulate the formation of\nfog. During this process, we jointly learn the atmospheric light and scattering\ncoefficient while optimizing the Gaussian representation of the hazy scene. 
In\nthe inference stage, we eliminate the effects of scattering and attenuation on\nthe Gaussians and directly project them onto a 2D plane to obtain a clear view.\nExperiments on both synthetic and real-world foggy datasets demonstrate that\nDehazeGS achieves state-of-the-art performance in terms of both rendering\nquality and computational efficiency.\n","authors":["Jinze Yu","Yiqun Wang","Zhengda Lu","Jianwei Guo","Yong Li","Hongxing Qin","Xiaopeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.03659v1.pdf","comment":"9 pages, 4 figures"},{"id":"http://arxiv.org/abs/2408.12928v2","updated":"2025-01-07T09:39:15Z","published":"2024-08-23T09:14:58Z","title":"ParGo: Bridging Vision-Language with Partial and Global Views","summary":" This work presents ParGo, a novel Partial-Global projector designed to\nconnect the vision and language modalities for Multimodal Large Language Models\n(MLLMs). Unlike previous works that rely on global attention-based projectors,\nour ParGo bridges the representation gap between the separately pre-trained\nvision encoders and the LLMs by integrating global and partial views, which\nalleviates the overemphasis on prominent regions. To facilitate the effective\ntraining of ParGo, we collect a large-scale detail-captioned image-text dataset\nnamed ParGoCap-1M-PT, consisting of 1 million images paired with high-quality\ncaptions. Extensive experiments on several MLLM benchmarks demonstrate the\neffectiveness of our ParGo, highlighting its superiority in aligning vision and\nlanguage modalities. Compared to the conventional Q-Former projector, our ParGo\nachieves an improvement of 259.96 on the MME benchmark. 
Furthermore, our\nexperiments reveal that ParGo significantly outperforms other projectors,\nparticularly in tasks that emphasize detail perception ability.\n","authors":["An-Lan Wang","Bin Shan","Wei Shi","Kun-Yu Lin","Xiang Fei","Guozhi Tang","Lei Liao","Jingqun Tang","Can Huang","Wei-Shi Zheng"],"pdf_url":"https://arxiv.org/pdf/2408.12928v2.pdf","comment":"Accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2411.16370v2","updated":"2025-01-07T09:34:51Z","published":"2024-11-25T13:26:09Z","title":"A Review of Bayesian Uncertainty Quantification in Deep Probabilistic\n Image Segmentation","summary":" Advancements in image segmentation play an integral role within the broad\nscope of Deep Learning-based Computer Vision. Furthermore, their widespread\napplicability in critical real-world tasks has resulted in challenges related\nto the reliability of such algorithms. Hence, uncertainty quantification has\nbeen extensively studied within this context, enabling the expression of model\nignorance (epistemic uncertainty) or data ambiguity (aleatoric uncertainty) to\nprevent uninformed decision-making. Due to the rapid adoption of Convolutional\nNeural Network (CNN)-based segmentation models in high-stake applications, a\nsubstantial body of research has been published on this very topic, causing its\nswift expansion into a distinct field. This work provides a comprehensive\noverview of probabilistic segmentation, by discussing fundamental concepts of\nuncertainty quantification, governing advancements in the field as well as the\napplication to various tasks. Moreover, literature on both types of\nuncertainties traces back to four key applications: (1) to quantify statistical\ninconsistencies in the annotation process due to ambiguous images, (2) correlating\nprediction error with uncertainty, (3) expanding the model hypothesis space for\nbetter generalization, and (4) Active Learning. 
An extensive discussion follows\nthat includes an overview of utilized datasets for each of the applications and\nevaluation of the available methods. We also highlight challenges related to\narchitectures, uncertainty quantification methods, standardization and\nbenchmarking, and finally end with recommendations for future work such as\nmethods based on single forward passes and models that appropriately leverage\nvolumetric data.\n","authors":["M. M. A. Valiuddin","R. J. G. van Sloun","C. G. A. Viviers","P. H. N. de With","F. van der Sommen"],"pdf_url":"https://arxiv.org/pdf/2411.16370v2.pdf","comment":"20 pages, revised"},{"id":"http://arxiv.org/abs/2409.00698v2","updated":"2025-01-07T09:26:03Z","published":"2024-09-01T11:39:13Z","title":"Enhancing Remote Sensing Vision-Language Models for Zero-Shot Scene\n Classification","summary":" Vision-Language Models for remote sensing have shown promising uses thanks to\ntheir extensive pretraining. However, their conventional usage in zero-shot\nscene classification methods still involves dividing large images into patches\nand making independent predictions, i.e., inductive inference, thereby limiting\ntheir effectiveness by ignoring valuable contextual information. Our approach\ntackles this issue by utilizing initial predictions based on text prompting and\npatch affinity relationships from the image encoder to enhance zero-shot\ncapabilities through transductive inference, all without the need for\nsupervision and at a minor computational cost. Experiments on 10 remote sensing\ndatasets with state-of-the-art Vision-Language Models demonstrate significant\naccuracy improvements over inductive zero-shot classification. 
Our source code\nis publicly available on Github: https://github.com/elkhouryk/RS-TransCLIP\n","authors":["Karim El Khoury","Maxime Zanella","Benoît Gérin","Tiffanie Godelaine","Benoît Macq","Saïd Mahmoudi","Christophe De Vleeschouwer","Ismail Ben Ayed"],"pdf_url":"https://arxiv.org/pdf/2409.00698v2.pdf","comment":"Accepted at ICASSP 2025"},{"id":"http://arxiv.org/abs/2501.01427v3","updated":"2025-01-07T09:16:57Z","published":"2025-01-02T18:59:54Z","title":"VideoAnydoor: High-fidelity Video Object Insertion with Precise Motion\n Control","summary":" Despite significant advancements in video generation, inserting a given\nobject into videos remains a challenging task. The difficulty lies in\npreserving the appearance details of the reference object and accurately\nmodeling coherent motions at the same time. In this paper, we propose\nVideoAnydoor, a zero-shot video object insertion framework with high-fidelity\ndetail preservation and precise motion control. Starting from a text-to-video\nmodel, we utilize an ID extractor to inject the global identity and leverage a\nbox sequence to control the overall motion. To preserve the detailed appearance\nand meanwhile support fine-grained motion control, we design a pixel warper. It\ntakes the reference image with arbitrary key-points and the corresponding\nkey-point trajectories as inputs. It warps the pixel details according to the\ntrajectories and fuses the warped features with the diffusion U-Net, thus\nimproving detail preservation and supporting users in manipulating the motion\ntrajectories. 
In addition, we propose a training strategy involving both videos\nand static images with a weighted loss to enhance insertion quality.\nVideoAnydoor demonstrates significant superiority over existing methods and\nnaturally supports various downstream applications (e.g., talking head\ngeneration, video virtual try-on, multi-region editing) without task-specific\nfine-tuning.\n","authors":["Yuanpeng Tu","Hao Luo","Xi Chen","Sihui Ji","Xiang Bai","Hengshuang Zhao"],"pdf_url":"https://arxiv.org/pdf/2501.01427v3.pdf","comment":"Project page: https://videoanydoor.github.io/"},{"id":"http://arxiv.org/abs/2410.16020v2","updated":"2025-01-07T09:15:19Z","published":"2024-10-21T13:50:32Z","title":"START: A Generalized State Space Model with Saliency-Driven Token-Aware\n Transformation","summary":" Domain Generalization (DG) aims to enable models to generalize to unseen\ntarget domains by learning from multiple source domains. Existing DG methods\nprimarily rely on convolutional neural networks (CNNs), which inherently learn\ntexture biases due to their limited receptive fields, making them prone to\noverfitting source domains. While some works have introduced transformer-based\nmethods (ViTs) for DG to leverage the global receptive field, these methods\nincur high computational costs due to the quadratic complexity of\nself-attention. Recently, advanced state space models (SSMs), represented by\nMamba, have shown promising results in supervised learning tasks by achieving\nlinear complexity in sequence length during training and fast RNN-like\ncomputation during inference. Inspired by this, we investigate the\ngeneralization ability of the Mamba model under domain shifts and find that\ninput-dependent matrices within SSMs could accumulate and amplify\ndomain-specific features, thus hindering model generalization. 
To address this\nissue, we propose a novel SSM-based architecture with saliency-based\ntoken-aware transformation (namely START), which achieves state-of-the-art\n(SOTA) performances and offers a competitive alternative to CNNs and ViTs. Our\nSTART can selectively perturb and suppress domain-specific features in salient\ntokens within the input-dependent matrices of SSMs, thus effectively reducing\nthe discrepancy between different domains. Extensive experiments on five\nbenchmarks demonstrate that START outperforms existing SOTA DG methods with\nefficient linear complexity. Our code is available at\nhttps://github.com/lingeringlight/START.\n","authors":["Jintao Guo","Lei Qi","Yinghuan Shi","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2410.16020v2.pdf","comment":"Accepted by NeurIPS2024. The code is available at\n https://github.com/lingeringlight/START"},{"id":"http://arxiv.org/abs/2501.03637v1","updated":"2025-01-07T09:12:55Z","published":"2025-01-07T09:12:55Z","title":"Advancing the Understanding of Fine-Grained 3D Forest Structures using\n Digital Cousins and Simulation-to-Reality: Methods and Datasets","summary":" Understanding and analyzing the spatial semantics and structure of forests is\nessential for accurate forest resource monitoring and ecosystem research.\nHowever, the lack of large-scale and annotated datasets has limited the\nwidespread use of advanced intelligent techniques in this field. To address\nthis challenge, a fully automated synthetic data generation and processing\nframework based on the concepts of Digital Cousins and Simulation-to-Reality\n(Sim2Real) is proposed, offering versatility and scalability to any size and\nplatform. Using this process, we created the Boreal3D, the world's largest\nforest point cloud dataset. It includes 1000 highly realistic and structurally\ndiverse forest plots across four different platforms, totaling 48,403 trees and\nover 35.3 billion points. 
Each point is labeled with semantic, instance, and\nviewpoint information, while each tree is described with structural parameters\nsuch as diameter, crown width, leaf area, and total volume. We designed and\nconducted extensive experiments to evaluate the potential of Boreal3D in\nadvancing fine-grained 3D forest structure analysis in real-world applications.\nThe results demonstrate that with certain strategies, models pre-trained on\nsynthetic data can significantly improve performance when applied to real\nforest datasets. Especially, the findings reveal that fine-tuning with only 20%\nof real-world data enables the model to achieve performance comparable to\nmodels trained exclusively on entire real-world data, highlighting the value\nand potential of our proposed framework. The Boreal3D dataset, and more\nbroadly, the synthetic data augmentation framework, is poised to become a\ncritical resource for advancing research in large-scale 3D forest scene\nunderstanding and structural parameter estimation.\n","authors":["Jing Liu","Duanchu Wang","Haoran Gong","Chongyu Wang","Jihua Zhu","Di Wang"],"pdf_url":"https://arxiv.org/pdf/2501.03637v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03631v1","updated":"2025-01-07T09:00:36Z","published":"2025-01-07T09:00:36Z","title":"Exploring Optimal Latent Trajetory for Zero-shot Image Editing","summary":" Editability and fidelity are two essential demands for text-driven image\nediting, which expects that the editing area should align with the target\nprompt and the rest should remain unchanged separately. The current\ncutting-edge editing methods usually obey an \"inversion-then-editing\" pipeline,\nwhere the source image is first inverted to an approximate Gaussian noise\n${z}_T$, based on which a sampling process is conducted using the target\nprompt. Nevertheless, we argue that it is not a good choice to use a\nnear-Gaussian noise as a pivot for further editing since it almost lost all\nstructure fidelity. 
We verify this by a pilot experiment, discovering that some\nintermediate-inverted latents can achieve a better trade-off between\neditability and fidelity than the fully-inverted ${z}_T$. Based on this, we\npropose a novel editing paradigm dubbed ZZEdit, which gently strengthens the\ntarget guidance on a sufficient-for-editing yet structure-preserving latent.\nSpecifically, we locate such an editing pivot by searching the first point on\nthe inversion trajectory which has larger response levels toward the target\nprompt than the source one. Then, we propose a ZigZag process to perform mild\ntarget guiding on this pivot, which fulfills denoising and inversion\niteratively, approaching the target while still holding fidelity. Afterwards,\nto achieve the same number of inversion and denoising steps, we perform a pure\nsampling process under the target prompt. Extensive experiments highlight the\neffectiveness of our ZZEdit in diverse image editing scenarios compared with\nthe \"inversion-then-editing\" pipeline.\n","authors":["Maomao Li","Yu Li","Yunfei Liu","Dong Xu"],"pdf_url":"https://arxiv.org/pdf/2501.03631v1.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2501.03630v1","updated":"2025-01-07T09:00:07Z","published":"2025-01-07T09:00:07Z","title":"MC-VTON: Minimal Control Virtual Try-On Diffusion Transformer","summary":" Virtual try-on methods based on diffusion models achieve realistic try-on\neffects. They use an extra reference network or an additional image encoder to\nprocess multiple conditional image inputs, which results in high training\ncosts. Besides, they require more than 25 inference steps, bringing a long\ninference time. In this work, with the development of diffusion transformer\n(DiT), we rethink the necessity of reference network or image encoder, then\npropose MC-VTON, enabling DiT to integrate minimal conditional try-on inputs by\nutilizing its intrinsic backbone. 
Compared to existing methods, the superiority\nof MC-VTON is demonstrated in four aspects: (1) Superior detail fidelity. Our\nDiT-based MC-VTON exhibits superior fidelity in preserving fine-grained\ndetails. (2) Simplified network and inputs. We remove any extra reference\nnetwork or image encoder. We also remove unnecessary conditions like the long\nprompt, pose estimation, human parsing, and depth map. We require only the\nmasked person image and the garment image. (3) Parameter-efficient training. To\nprocess the try-on task, we fine-tune the FLUX.1-dev with only 39.7M additional\nparameters (0.33% of the backbone parameters). (4) Fewer inference steps. We apply\ndistillation diffusion on MC-VTON and only need 8 steps to generate a realistic\ntry-on image, with only 86.8M additional parameters (0.72% of the backbone\nparameters). Experiments show that MC-VTON achieves superior qualitative and\nquantitative results with fewer condition inputs, fewer inference steps, and\nfewer trainable parameters than baseline methods.\n","authors":["Junsheng Luan","Guangyuan Li","Lei Zhao","Wei Xing"],"pdf_url":"https://arxiv.org/pdf/2501.03630v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03629v1","updated":"2025-01-07T08:59:20Z","published":"2025-01-07T08:59:20Z","title":"CFFormer: Cross CNN-Transformer Channel Attention and Spatial Feature\n Fusion for Improved Segmentation of Low Quality Medical Images","summary":" Hybrid CNN-Transformer models are designed to combine the advantages of\nConvolutional Neural Networks (CNNs) and Transformers to efficiently model both\nlocal information and long-range dependencies. However, most research tends to\nfocus on integrating the spatial features of CNNs and Transformers, while\noverlooking the critical importance of channel features. 
This is particularly\nsignificant for model performance in low-quality medical image segmentation.\nEffective channel feature extraction can significantly enhance the model's\nability to capture contextual information and improve its representation\ncapabilities. To address this issue, we propose a hybrid CNN-Transformer model,\nCFFormer, and introduce two modules: the Cross Feature Channel Attention (CFCA)\nmodule and the X-Spatial Feature Fusion (XFF) module. The model incorporates\ndual encoders, with the CNN encoder focusing on capturing local features and\nthe Transformer encoder modeling global features. The CFCA module filters and\nfacilitates interactions between the channel features from the two encoders,\nwhile the XFF module effectively reduces the significant semantic information\ndifferences in spatial features, enabling a smooth and cohesive spatial feature\nfusion. We evaluate our model across eight datasets covering five modalities to\ntest its generalization capability. Experimental results demonstrate that our\nmodel outperforms current state-of-the-art (SOTA) methods, with particularly\nsuperior performance on datasets characterized by blurry boundaries and low\ncontrast.\n","authors":["Jiaxuan Li","Qing Xu","Xiangjian He","Ziyu Liu","Daokun Zhang","Ruili Wang","Rong Qu","Guoping Qiu"],"pdf_url":"https://arxiv.org/pdf/2501.03629v1.pdf","comment":"The article consists of 15 pages, including 10 figures and 7 tables.\n The code will be made open-source once the article is accepted by the journal"},{"id":"http://arxiv.org/abs/2208.06538v2","updated":"2025-01-07T08:52:30Z","published":"2022-08-13T01:20:39Z","title":"Transferable Adversarial Examples with Bayes Approach","summary":" The vulnerability of deep neural networks (DNNs) to black-box adversarial\nattacks is one of the most heated topics in trustworthy AI. 
In such attacks,\nthe attackers operate without any insider knowledge of the model, making the\ncross-model transferability of adversarial examples critical. Despite the\npotential for adversarial examples to be effective across various models, it\nhas been observed that adversarial examples that are specifically crafted for a\nspecific model often exhibit poor transferability. In this paper, we explore\nthe transferability of adversarial examples through the lens of the Bayesian approach.\nSpecifically, we leverage the Bayesian approach to probe the transferability and\nthen study what constitutes a transferability-promoting prior. Following this,\nwe design two concrete transferability-promoting priors, along with an adaptive\ndynamic weighting strategy for instances sampled from these priors. Employing\nthese techniques, we present BayAtk. Extensive experiments illustrate the\nsignificant effectiveness of BayAtk in crafting more transferable adversarial\nexamples against both undefended and defended black-box models compared to\nexisting state-of-the-art attacks.\n","authors":["Mingyuan Fan","Cen Chen","Wenmeng Zhou","Yinggui Wang"],"pdf_url":"https://arxiv.org/pdf/2208.06538v2.pdf","comment":"Accepted in AsiaCCS'25"},{"id":"http://arxiv.org/abs/2501.02487v2","updated":"2025-01-07T08:47:34Z","published":"2025-01-05T09:40:58Z","title":"ACE++: Instruction-Based Image Creation and Editing via Context-Aware\n Content Filling","summary":" We report ACE++, an instruction-based diffusion framework that tackles\nvarious image generation and editing tasks. Inspired by the input format for\nthe inpainting task proposed by FLUX.1-Fill-dev, we improve the Long-context\nCondition Unit (LCU) introduced in ACE and extend this input paradigm to any\nediting and generation tasks. To take full advantage of image generative\npriors, we develop a two-stage training scheme to minimize the efforts of\nfinetuning powerful text-to-image diffusion models like FLUX.1-dev. 
In the\nfirst stage, we pre-train the model using task data with the 0-ref tasks from\nthe text-to-image model. There are many models in the community based on the\npost-training of text-to-image foundational models that meet this training\nparadigm of the first stage. For example, FLUX.1-Fill-dev deals primarily with\npainting tasks and can be used as an initialization to accelerate the training\nprocess. In the second stage, we finetune the above model to support the\ngeneral instructions using all tasks defined in ACE. To promote the widespread\napplication of ACE++ in different scenarios, we provide a comprehensive set of\nmodels that cover both full finetuning and lightweight finetuning, while\nconsidering general applicability and applicability in vertical scenarios. The\nqualitative analysis showcases the superiority of ACE++ in terms of generating\nimage quality and prompt following ability. Code and models will be available\non the project page: https://ali-vilab.github.io/ACE_plus_page/.\n","authors":["Chaojie Mao","Jingfeng Zhang","Yulin Pan","Zeyinzi Jiang","Zhen Han","Yu Liu","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2501.02487v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03619v1","updated":"2025-01-07T08:36:46Z","published":"2025-01-07T08:36:46Z","title":"Deep Learning-based Compression Detection for explainable Face Image\n Quality Assessment","summary":" The assessment of face image quality is crucial to ensure reliable face\nrecognition. In order to provide data subjects and operators with explainable\nand actionable feedback regarding captured face images, relevant quality\ncomponents have to be measured. Quality components that are known to negatively\nimpact the utility of face images include JPEG and JPEG 2000 compression\nartefacts, among others. Compression can result in a loss of important image\ndetails which may impair the recognition performance. 
In this work, deep neural\nnetworks are trained to detect the compression artefacts in face images. For\nthis purpose, artefact-free facial images are compressed with the JPEG and JPEG\n2000 compression algorithms. Subsequently, the PSNR and SSIM metrics are\nemployed to obtain training labels based on which neural networks are trained\nusing a single network to detect JPEG and JPEG 2000 artefacts, respectively.\nThe evaluation of the proposed method shows promising results: in terms of\ndetection accuracy, error rates of 2-3% are obtained for utilizing PSNR labels\nduring training. In addition, we show that error rates of different open-source\nand commercial face recognition systems can be significantly reduced by\ndiscarding face images exhibiting severe compression artefacts. To minimize\nresource consumption, EfficientNetV2 serves as the basis for the presented\nalgorithm, which is available as part of the OFIQ software.\n","authors":["Laurin Jonientz","Johannes Merkle","Christian Rathgeb","Benjamin Tams","Georg Merz"],"pdf_url":"https://arxiv.org/pdf/2501.03619v1.pdf","comment":"2nd Workshop on Fairness in Biometric Systems (FAIRBIO) at\n International Conference on Pattern Recognition (ICPR) 2024"},{"id":"http://arxiv.org/abs/2501.03616v1","updated":"2025-01-07T08:32:48Z","published":"2025-01-07T08:32:48Z","title":"BTMTrack: Robust RGB-T Tracking via Dual-template Bridging and\n Temporal-Modal Candidate Elimination","summary":" RGB-T tracking leverages the complementary strengths of RGB and thermal\ninfrared (TIR) modalities to address challenging scenarios such as low\nillumination and adverse weather. However, existing methods often fail to\neffectively integrate temporal information and perform efficient cross-modal\ninteractions, which constrain their adaptability to dynamic targets. In this\npaper, we propose BTMTrack, a novel framework for RGB-T tracking. 
The core of\nour approach lies in the dual-template backbone network and the Temporal-Modal\nCandidate Elimination (TMCE) strategy. The dual-template backbone effectively\nintegrates temporal information, while the TMCE strategy focuses the model on\ntarget-relevant tokens by evaluating temporal and modal correlations, reducing\ncomputational overhead and avoiding irrelevant background noise. Building upon\nthis foundation, we propose the Temporal Dual Template Bridging (TDTB) module,\nwhich facilitates precise cross-modal fusion through dynamically filtered\ntokens. This approach further strengthens the interaction between templates and\nthe search region. Extensive experiments conducted on three benchmark datasets\ndemonstrate the effectiveness of BTMTrack. Our method achieves state-of-the-art\nperformance, with a 72.3% precision rate on the LasHeR test set and competitive\nresults on RGBT210 and RGBT234 datasets.\n","authors":["Zhongxuan Zhang","Bi Zeng","Xinyu Ni","Yimin Du"],"pdf_url":"https://arxiv.org/pdf/2501.03616v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04783v2","updated":"2025-01-07T08:23:43Z","published":"2024-12-06T05:20:08Z","title":"KNN-MMD: Cross Domain Wireless Sensing via Local Distribution Alignment","summary":" Wireless sensing has recently found widespread applications in diverse\nenvironments, including homes, offices, and public spaces. By analyzing\npatterns in channel state information (CSI), it is possible to infer human\nactions for tasks such as person identification, gesture recognition, and fall\ndetection. However, CSI is highly sensitive to environmental changes, where\neven minor alterations can significantly distort the CSI patterns. This\nsensitivity often leads to performance degradation or outright failure when\napplying wireless sensing models trained in one environment to another. 
To\naddress this challenge, Domain Alignment (DAL) has been widely adopted for\ncross-domain classification tasks, as it focuses on aligning the global\ndistributions of the source and target domains in feature space. Despite its\npopularity, DAL often neglects inter-category relationships, which can lead to\nmisalignment between categories across domains, even when global alignment is\nachieved. To overcome these limitations, we propose K-Nearest Neighbors Maximum\nMean Discrepancy (KNN-MMD), a novel few-shot method for cross-domain wireless\nsensing. Our approach begins by constructing a help set using KNN from the\ntarget domain, enabling local alignment between the source and target domains\nwithin each category using MMD. Additionally, we address a key instability\nissue commonly observed in cross-domain methods, where model performance\nfluctuates sharply between epochs. Further, most existing methods struggle to\ndetermine an optimal stopping point during training due to the absence of\nlabeled data from the target domain. Our method resolves this by excluding the\nsupport set from the target domain during training and employing it as a\nvalidation set to determine the stopping criterion.\n","authors":["Zijian Zhao","Zhijie Cai","Tingwei Chen","Xiaoyang Li","Hang Li","Qimei Chen","Guangxu Zhu"],"pdf_url":"https://arxiv.org/pdf/2412.04783v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03606v1","updated":"2025-01-07T08:14:53Z","published":"2025-01-07T08:14:53Z","title":"VTAO-BiManip: Masked Visual-Tactile-Action Pre-training with Object\n Understanding for Bimanual Dexterous Manipulation","summary":" Bimanual dexterous manipulation remains a significant challenge in robotics\ndue to the high DoFs of each hand and their coordination. 
Existing single-hand\nmanipulation techniques often leverage human demonstrations to guide RL methods\nbut fail to generalize to complex bimanual tasks involving multiple sub-skills.\nIn this paper, we introduce VTAO-BiManip, a novel framework that combines\nvisual-tactile-action pretraining with object understanding to facilitate\ncurriculum RL to enable human-like bimanual manipulation. We improve prior\nlearning by incorporating hand motion data, providing more effective guidance\nfor dual-hand coordination than binary tactile feedback. Our pretraining model\npredicts future actions as well as object pose and size using masked multimodal\ninputs, facilitating cross-modal regularization. To address the multi-skill\nlearning challenge, we introduce a two-stage curriculum RL approach to\nstabilize training. We evaluate our method on a bottle-cap unscrewing task,\ndemonstrating its effectiveness in both simulated and real-world environments.\nOur approach achieves a success rate that surpasses existing visual-tactile\npretraining methods by over 20%.\n","authors":["Zhengnan Sun","Zhaotai Shi","Jiayin Chen","Qingtao Liu","Yu Cui","Qi Ye","Jiming Chen"],"pdf_url":"https://arxiv.org/pdf/2501.03606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03605v1","updated":"2025-01-07T08:06:35Z","published":"2025-01-07T08:06:35Z","title":"ConcealGS: Concealing Invisible Copyright Information in 3D Gaussian\n Splatting","summary":" With the rapid development of 3D reconstruction technology, the widespread\ndistribution of 3D data has become a future trend. While traditional visual\ndata (such as images and videos) and NeRF-based formats already have mature\ntechniques for copyright protection, steganographic techniques for the emerging\n3D Gaussian Splatting (3D-GS) format have yet to be fully explored. To address\nthis, we propose ConcealGS, an innovative method for embedding implicit\ninformation into 3D-GS. 
By introducing the knowledge distillation and gradient\noptimization strategy based on 3D-GS, ConcealGS overcomes the limitations of\nNeRF-based models and enhances the robustness of implicit information and the\nquality of 3D reconstruction. We evaluate ConcealGS in various potential\napplication scenarios, and experimental results have demonstrated that\nConcealGS not only successfully recovers implicit information but also has\nalmost no impact on rendering quality, providing a new approach for embedding\ninvisible and recoverable information into 3D models in the future.\n","authors":["Yifeng Yang","Hengyu Liu","Chenxin Li","Yining Sun","Wuyang Li","Yifan Liu","Yiyang Lin","Yixuan Yuan","Nanyang Ye"],"pdf_url":"https://arxiv.org/pdf/2501.03605v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.02807v2","updated":"2025-01-07T07:47:22Z","published":"2025-01-06T07:00:22Z","title":"AE-NeRF: Augmenting Event-Based Neural Radiance Fields for Non-ideal\n Conditions and Larger Scene","summary":" Compared to frame-based methods, computational neuromorphic imaging using\nevent cameras offers significant advantages, such as minimal motion blur,\nenhanced temporal resolution, and high dynamic range. The multi-view\nconsistency of Neural Radiance Fields combined with the unique benefits of\nevent cameras, has spurred recent research into reconstructing NeRF from data\ncaptured by moving event cameras. While showing impressive performance,\nexisting methods rely on ideal conditions with the availability of uniform and\nhigh-quality event sequences and accurate camera poses, and mainly focus on the\nobject level reconstruction, thus limiting their practical applications. In\nthis work, we propose AE-NeRF to address the challenges of learning event-based\nNeRF from non-ideal conditions, including non-uniform event sequences, noisy\nposes, and various scales of scenes. 
Our method exploits the density of event\nstreams and jointly learns a pose correction module with an event-based NeRF\n(e-NeRF) framework for robust 3D reconstruction from inaccurate camera poses.\nTo generalize to larger scenes, we propose hierarchical event distillation with\na proposal e-NeRF network and a vanilla e-NeRF network to resample and refine\nthe reconstruction process. We further propose an event reconstruction loss and\na temporal loss to improve the view consistency of the reconstructed scene. We\nestablished a comprehensive benchmark that includes large-scale scenes to\nsimulate practical non-ideal conditions, incorporating both synthetic and\nchallenging real-world event datasets. The experimental results show that our\nmethod achieves a new state-of-the-art in event-based 3D reconstruction.\n","authors":["Chaoran Feng","Wangbo Yu","Xinhua Cheng","Zhenyu Tang","Junwu Zhang","Li Yuan","Yonghong Tian"],"pdf_url":"https://arxiv.org/pdf/2501.02807v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03592v1","updated":"2025-01-07T07:45:21Z","published":"2025-01-07T07:45:21Z","title":"A Value Mapping Virtual Staining Framework for Large-scale Histological\n Imaging","summary":" The emergence of virtual staining technology provides a rapid and efficient\nalternative for researchers in tissue pathology. It enables the utilization of\nunlabeled microscopic samples to generate virtual replicas of chemically\nstained histological slices, or facilitate the transformation of one staining\ntype into another. The remarkable performance of generative networks, such as\nCycleGAN, offers an unsupervised learning approach for virtual coloring,\novercoming the limitations of high-quality paired data required in supervised\nlearning. Nevertheless, large-scale color transformation necessitates\nprocessing large field-of-view images in patches, often resulting in\nsignificant boundary inconsistency and artifacts. 
Additionally, the\ntransformation between different colorized modalities typically requires further\neffort to modify loss functions and tune hyperparameters for independent\ntraining of networks. In this study, we introduce a general virtual staining\nframework that is adaptable to various conditions. We propose a loss function\nbased on the value mapping constraint to ensure the accuracy of virtual\ncoloring between different pathological modalities, termed the Value Mapping\nGenerative Adversarial Network (VM-GAN). Meanwhile, we present a\nconfidence-based tiling method to address the challenge of boundary\ninconsistency arising from patch-wise processing. Experimental results on\ndiverse data with varying staining protocols demonstrate that our method\nachieves superior quantitative indicators and improved visual perception.\n","authors":["Junjia Wang","Bo Xiong","You Zhou","Xun Cao","Zhan Ma"],"pdf_url":"https://arxiv.org/pdf/2501.03592v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15507v2","updated":"2025-01-07T07:35:10Z","published":"2024-07-22T09:44:35Z","title":"SpotDiffusion: A Fast Approach For Seamless Panorama Generation Over\n Time","summary":" Generating high-resolution images with generative models has recently been\nmade widely accessible by leveraging diffusion models pre-trained on\nlarge-scale datasets. Various techniques, such as MultiDiffusion and\nSyncDiffusion, have further pushed image generation beyond training\nresolutions, i.e., from square images to panorama, by merging multiple\noverlapping diffusion paths or employing gradient descent to maintain\nperceptual coherence. However, these methods suffer from significant\ncomputational inefficiencies due to generating and averaging numerous\npredictions, which is required in practice to produce high-quality and seamless\nimages. 
This work addresses this limitation and presents a novel approach that\neliminates the need to generate and average numerous overlapping denoising\npredictions. Our method shifts non-overlapping denoising windows over time,\nensuring that seams in one timestep are corrected in the next. This results in\ncoherent, high-resolution images with fewer overall steps. We demonstrate the\neffectiveness of our approach through qualitative and quantitative evaluations,\ncomparing it with MultiDiffusion, SyncDiffusion, and StitchDiffusion. Our\nmethod offers several key benefits, including improved computational efficiency\nand faster inference times while producing comparable or better image quality.\nLink to code https://github.com/stanifrolov/spotdiffusion\n","authors":["Stanislav Frolov","Brian B. Moser","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2407.15507v2.pdf","comment":"Project page: https://spotdiffusion.github.io/"},{"id":"http://arxiv.org/abs/2411.15778v3","updated":"2025-01-07T07:31:00Z","published":"2024-11-24T10:58:48Z","title":"Enhancing the automatic segmentation and analysis of 3D liver\n vasculature models","summary":" Surgical assessment of liver cancer patients requires identification of the\nvessel trees from medical images. Specifically, the venous trees - the portal\n(perfusing) and the hepatic (draining) trees - are important for understanding\nthe liver anatomy and disease state, and for surgery planning. This\nresearch aims to improve the 3D segmentation, skeletonization, and subsequent\nanalysis of vessel trees, by creating an automatic pipeline based on deep\nlearning and image processing techniques.\n The first part of this work explores the impact of differentiable\nskeletonization methods such as ClDice and morphological skeletonization loss,\non the overall liver vessel segmentation performance. 
To this end, it studies\nhow to improve vessel tree connectivity.\n The second part of this study converts a single-class vessel segmentation\ninto a multi-class one, separating the two venous trees. It builds on the\nprevious two-class vessel segmentation model, whose vessel tree outputs might\nbe entangled, and on connected components and skeleton analyses of the trees.\n After providing sub-labeling of the specific anatomical branches of each\nvenous tree, these algorithms also enable a morphometric analysis of the vessel\ntrees by extracting various geometrical markers.\n In conclusion, we propose a method that successfully improves current\nskeletonization methods, for extensive vascular trees that contain vessels of\ndifferent calibers. The separation algorithm creates a clean multi-class\nsegmentation of the vessels, validated by surgeons to provide low error. A new,\npublicly shared high-quality liver vessel dataset of 77 cases is thus created.\nFinally, a method to annotate vessel trees according to anatomy is provided,\nenabling a unique liver vessel morphometry analysis.\n","authors":["Yassine Machta","Omar Ali","Kevin Hakkakian","Ana Vlasceanu","Amaury Facque","Nicolas Golse","Irene Vignon-Clementel"],"pdf_url":"https://arxiv.org/pdf/2411.15778v3.pdf","comment":"Paper presented at MICCAI 2024 Workshop: ADSMI. This work was done in\n the context of an internship at Simbiotx, Inria"},{"id":"http://arxiv.org/abs/2501.03580v1","updated":"2025-01-07T07:08:46Z","published":"2025-01-07T07:08:46Z","title":"BASIC: Semi-supervised Multi-organ Segmentation with Balanced Subclass\n Regularization and Semantic-conflict Penalty","summary":" Semi-supervised learning (SSL) has shown notable potential in relieving the\nheavy demand of dense prediction tasks on large-scale well-annotated datasets,\nespecially for the challenging multi-organ segmentation (MoS). 
However, the\nprevailing class-imbalance problem in MoS caused by the substantial variations\nin organ size exacerbates the learning difficulty of the SSL network. To\naddress this issue, in this paper, we propose an innovative semi-supervised\nnetwork with BAlanced Subclass regularIzation and semantic-Conflict penalty\nmechanism (BASIC) to effectively learn the unbiased knowledge for\nsemi-supervised MoS. Concretely, we construct a novel auxiliary subclass\nsegmentation (SCS) task based on previously generated balanced subclasses, thus\ndeeply mining the unbiased information for the main MoS task in the\nfashion of multi-task learning. Additionally, based on a mean teacher\nframework, we elaborately design a balanced subclass regularization to utilize\nthe teacher predictions of the SCS task to supervise the student predictions of the MoS\ntask, thus effectively transferring unbiased knowledge to the MoS subnetwork\nand alleviating the influence of the class-imbalance problem. Considering the\nsimilar semantic information inside the subclasses and their corresponding\noriginal classes (i.e., parent classes), we devise a semantic-conflict penalty\nmechanism to assign heavier penalties to conflicting SCS predictions with\nwrong parent classes and provide a more accurate constraint to the MoS\npredictions. 
Extensive experiments conducted on two publicly available\ndatasets, i.e., the WORD dataset and the MICCAI FLARE 2022 dataset, have\nverified the superior performance of our proposed BASIC compared to other\nstate-of-the-art methods.\n","authors":["Zhenghao Feng","Lu Wen","Yuanyuan Xu","Binyu Yan","Xi Wu","Jiliu Zhou","Yan Wang"],"pdf_url":"https://arxiv.org/pdf/2501.03580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03173v3","updated":"2025-01-07T07:05:05Z","published":"2024-02-05T16:41:02Z","title":"MULTI: Multimodal Understanding Leaderboard with Text and Images","summary":" The rapid development of multimodal large language models (MLLMs) raises the\nquestion of how they compare to human performance. While existing datasets\noften feature synthetic or overly simplistic tasks, some models have already\nsurpassed human expert baselines. In this paper, we present MULTI, a Chinese\nmultimodal dataset derived from authentic examination questions. Comprising\nover 18,000 carefully selected and refined questions, MULTI evaluates models\nusing real-world examination standards, encompassing image-text comprehension,\ncomplex reasoning, and knowledge recall. Additionally, we introduce\nMULTI-Elite, a hard subset of 500 selected questions, and MULTI-Extend, with more\nthan 4,500 external knowledge context pieces for testing in-context learning\ncapabilities. Our evaluation highlights substantial room for MLLM advancement,\nwith Qwen2-VL-72B achieving a 76.9% accuracy on MULTI and 53.1% on MULTI-Elite,\nleading the 25 evaluated models, compared to human expert baselines of 86.1% and\n73.1%. 
MULTI not only serves as a robust evaluation platform but also paves the\nway for the development of expert-level AI.\n","authors":["Zichen Zhu","Yang Xu","Lu Chen","Jingkai Yang","Yichuan Ma","Yiming Sun","Hailin Wen","Jiaqi Liu","Jinyu Cai","Yingzi Ma","Situo Zhang","Zihan Zhao","Liangtai Sun","Kai Yu"],"pdf_url":"https://arxiv.org/pdf/2402.03173v3.pdf","comment":"24 pages, 19 figures, 10 tables. Details and access are available at:\n https://OpenDFM.github.io/MULTI-Benchmark/"},{"id":"http://arxiv.org/abs/2501.03575v1","updated":"2025-01-07T06:55:50Z","published":"2025-01-07T06:55:50Z","title":"Cosmos World Foundation Model Platform for Physical AI","summary":" Physical AI needs to be trained digitally first. It needs a digital twin of\nitself, the policy model, and a digital twin of the world, the world model. In\nthis paper, we present the Cosmos World Foundation Model Platform to help\ndevelopers build customized world models for their Physical AI setups. We\nposition a world foundation model as a general-purpose world model that can be\nfine-tuned into customized world models for downstream applications. Our\nplatform covers a video curation pipeline, pre-trained world foundation models,\nexamples of post-training of pre-trained world foundation models, and video\ntokenizers. 
To help Physical AI builders solve the most critical problems of\nour society, we make our platform open-source and our models open-weight with\npermissive licenses available via https://github.com/NVIDIA/Cosmos.\n","authors":[" NVIDIA"," :","Niket Agarwal","Arslan Ali","Maciej Bala","Yogesh Balaji","Erik Barker","Tiffany Cai","Prithvijit Chattopadhyay","Yongxin Chen","Yin Cui","Yifan Ding","Daniel Dworakowski","Jiaojiao Fan","Michele Fenzi","Francesco Ferroni","Sanja Fidler","Dieter Fox","Songwei Ge","Yunhao Ge","Jinwei Gu","Siddharth Gururani","Ethan He","Jiahui Huang","Jacob Huffman","Pooya Jannaty","Jingyi Jin","Seung Wook Kim","Gergely Klár","Grace Lam","Shiyi Lan","Laura Leal-Taixe","Anqi Li","Zhaoshuo Li","Chen-Hsuan Lin","Tsung-Yi Lin","Huan Ling","Ming-Yu Liu","Xian Liu","Alice Luo","Qianli Ma","Hanzi Mao","Kaichun Mo","Arsalan Mousavian","Seungjun Nah","Sriharsha Niverty","David Page","Despoina Paschalidou","Zeeshan Patel","Lindsey Pavao","Morteza Ramezanali","Fitsum Reda","Xiaowei Ren","Vasanth Rao Naik Sabavat","Ed Schmerling","Stella Shi","Bartosz Stefaniak","Shitao Tang","Lyne Tchapmi","Przemek Tredak","Wei-Cheng Tseng","Jibin Varghese","Hao Wang","Haoxiang Wang","Heng Wang","Ting-Chun Wang","Fangyin Wei","Xinyue Wei","Jay Zhangjie Wu","Jiashu Xu","Wei Yang","Lin Yen-Chen","Xiaohui Zeng","Yu Zeng","Jing Zhang","Qinsheng Zhang","Yuxuan Zhang","Qingqing Zhao","Artur Zolkowski"],"pdf_url":"https://arxiv.org/pdf/2501.03575v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.01595v2","updated":"2025-01-07T06:55:35Z","published":"2025-01-03T01:54:16Z","title":"Adaptive Homophily Clustering: Structure Homophily Graph Learning with\n Adaptive Filter for Hyperspectral Image","summary":" Hyperspectral image (HSI) clustering has been a fundamental but challenging\ntask with zero training labels. 
Currently, some deep graph clustering methods\nhave been successfully explored for HSI due to their outstanding performance in\neffective spatial structural information encoding. Nevertheless, insufficient\nstructural information utilization, poor feature representation ability, and weak\ngraph update capability limit their performance. Thus, in this paper, a\nhomophily structure graph learning with an adaptive filter clustering method\n(AHSGC) for HSI is proposed. Specifically, homogeneous region generation is\nfirst developed for HSI processing and constructing the original graph.\nAfterward, an adaptive filter graph encoder is designed to adaptively capture\nthe high and low frequency features on the graph for subsequent processing.\nThen, a graph embedding clustering self-training decoder is developed with KL\nDivergence, with which the pseudo-label is generated for network training.\nMeanwhile, homophily-enhanced structure learning is introduced to update the\ngraph according to the clustering task, in which the orient correlation\nestimation is adopted to estimate the node connection, and graph edge\nsparsification is designed to adjust the edges in the graph dynamically.\nFinally, a joint network optimization is introduced to achieve network\nself-training and update the graph. K-means is adopted to express the\nlatent features. Extensive experiments and repeated comparative analysis have\nverified that our AHSGC achieves high clustering accuracy, low computational\ncomplexity, and strong robustness. 
The source code will be available at\nhttps://github.com/DY-HYX.\n","authors":["Yao Ding","Weijie Kang","Aitao Yang","Zhili Zhang","Junyang Zhao","Jie Feng","Danfeng Hong","Qinhe Zheng"],"pdf_url":"https://arxiv.org/pdf/2501.01595v2.pdf","comment":"14 pages, 8 figures"},{"id":"http://arxiv.org/abs/2403.10089v4","updated":"2025-01-07T06:45:58Z","published":"2024-03-15T08:05:16Z","title":"Approximation and bounding techniques for the Fisher-Rao distances\n between parametric statistical models","summary":" The Fisher-Rao distance between two probability distributions of a\nstatistical model is defined as the Riemannian geodesic distance induced by the\nFisher information metric. In order to calculate the Fisher-Rao distance in\nclosed-form, we need (1) to elicit a formula for the Fisher-Rao geodesics, and\n(2) to integrate the Fisher length element along those geodesics. We consider\nseveral numerically robust approximation and bounding techniques for the\nFisher-Rao distances: First, we report generic upper bounds on Fisher-Rao\ndistances based on closed-form 1D Fisher-Rao distances of submodels. Second, we\ndescribe several generic approximation schemes depending on whether the\nFisher-Rao geodesics or pregeodesics are available in closed-form or not. In\nparticular, we obtain a generic method to guarantee an arbitrarily small\nadditive error on the approximation provided that Fisher-Rao pregeodesics and\ntight lower and upper bounds are available. Third, we consider the case of\nFisher metrics being Hessian metrics, and report generic tight upper bounds on\nthe Fisher-Rao distances using techniques of information geometry.\nUniparametric and biparametric statistical models always have Fisher Hessian\nmetrics, and in general a simple test allows one to check whether the Fisher\ninformation matrix yields a Hessian metric or not. Fourth, we consider\nelliptical distribution families and show how to apply the above techniques to\nthese models. 
We also propose two new distances based either on the Fisher-Rao\nlengths of curves serving as proxies of Fisher-Rao geodesics, or based on the\nBirkhoff/Hilbert projective cone distance. Last, we consider an alternative\ngroup-theoretic approach for statistical transformation models based on the\nnotion of maximal invariant which yields insights on the structures of the\nFisher-Rao distance formula which may be used fruitfully in applications.\n","authors":["Frank Nielsen"],"pdf_url":"https://arxiv.org/pdf/2403.10089v4.pdf","comment":"48 pages"},{"id":"http://arxiv.org/abs/2501.03567v1","updated":"2025-01-07T06:35:34Z","published":"2025-01-07T06:35:34Z","title":"Evaluating Image Caption via Cycle-consistent Text-to-Image Generation","summary":" Evaluating image captions typically relies on reference captions, which are\ncostly to obtain and exhibit significant diversity and subjectivity. While\nreference-free evaluation metrics have been proposed, most focus on cross-modal\nevaluation between captions and images. Recent research has revealed that the\nmodality gap generally exists in the representation of contrastive\nlearning-based multi-modal systems, undermining the reliability of\ncross-modality metrics like CLIPScore. In this paper, we propose CAMScore, a\ncyclic reference-free automatic evaluation metric for image captioning models.\nTo circumvent the aforementioned modality gap, CAMScore utilizes a\ntext-to-image model to generate images from captions and subsequently evaluates\nthese generated images against the original images. Furthermore, to provide\nfine-grained information for a more comprehensive evaluation, we design a\nthree-level evaluation framework for CAMScore that encompasses pixel-level,\nsemantic-level, and objective-level perspectives. 
Extensive experimental results\nacross multiple benchmark datasets show that CAMScore achieves a superior\ncorrelation with human judgments compared to existing reference-based and\nreference-free metrics, demonstrating the effectiveness of the framework.\n","authors":["Tianyu Cui","Jinbin Bai","Guohua Wang","Qingguo Chen","Zhao Xu","Weihua Luo","Kaifu Zhang","Ye Shi"],"pdf_url":"https://arxiv.org/pdf/2501.03567v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.08983v2","updated":"2025-01-07T06:32:53Z","published":"2022-12-18T01:07:20Z","title":"Adaptive deep learning framework for robust unsupervised underwater\n image enhancement","summary":" One of the main challenges in deep learning-based underwater image\nenhancement is the limited availability of high-quality training data.\nUnderwater images are difficult to capture and are often of poor quality due to\nthe distortion and loss of colour and contrast in water. This makes it\ndifficult to train supervised deep learning models on large and diverse\ndatasets, which can limit the model's performance. In this paper, we explore an\nalternative approach to supervised underwater image enhancement. Specifically,\nwe propose a novel unsupervised underwater image enhancement framework that\nemploys a conditional variational autoencoder (cVAE) to train a deep learning\nmodel with probabilistic adaptive instance normalization (PAdaIN) and\nstatistically guided multi-colour space stretch that produces realistic\nunderwater images. The resulting framework is composed of a U-Net as a feature\nextractor and a PAdaIN to encode the uncertainty, which we call UDnet. To\nimprove the visual quality of the images generated by UDnet, we use a\nstatistically guided multi-colour space stretch module that ensures visual\nconsistency with the input image and provides an alternative to training using\na ground truth image. 
The proposed model does not need manual human annotation,\ncan learn with a limited amount of data, and achieves state-of-the-art\nresults on underwater images. We evaluated our proposed framework on eight\npublicly-available datasets. The results show that our proposed framework\nyields competitive performance compared to other state-of-the-art approaches in\nquantitative as well as qualitative metrics. Code available at\nhttps://github.com/alzayats/UDnet .\n","authors":["Alzayat Saleh","Marcus Sheaves","Dean Jerry","Mostafa Rahimi Azghadi"],"pdf_url":"https://arxiv.org/pdf/2212.08983v2.pdf","comment":"25 pages, 7 figures, 6 tables, accepted for publication in Expert\n Systems with Applications"},{"id":"http://arxiv.org/abs/2501.03565v1","updated":"2025-01-07T06:30:52Z","published":"2025-01-07T06:30:52Z","title":"Bridged Semantic Alignment for Zero-shot 3D Medical Image Diagnosis","summary":" 3D medical images such as Computed tomography (CT) are widely used in\nclinical practice, offering a great potential for automatic diagnosis.\nSupervised learning-based approaches have achieved significant progress but\nrely heavily on extensive manual annotations, limited by the availability of\ntraining data and the diversity of abnormality types. Vision-language alignment\n(VLA) offers a promising alternative by enabling zero-shot learning without\nadditional annotations. However, we empirically discover that the visual and\ntextual embeddings after the alignment efforts of existing VLA methods form\ntwo well-separated clusters, presenting a wide gap to be bridged. To bridge\nthis gap, we propose a Bridged Semantic Alignment (BrgSA) framework. First, we\nutilize a large language model to perform semantic summarization of reports,\nextracting high-level semantic information. 
Second, we design a Cross-Modal\nKnowledge Interaction (CMKI) module that leverages a cross-modal knowledge bank\nas a semantic bridge, facilitating interaction between the two modalities,\nnarrowing the gap, and improving their alignment. To comprehensively evaluate\nour method, we construct a benchmark dataset that includes 15 underrepresented\nabnormalities and utilize two existing benchmark datasets. Experimental\nresults demonstrate that BrgSA achieves state-of-the-art performance on both\npublic benchmark datasets and our custom-labeled dataset, with significant\nimprovements in zero-shot diagnosis of underrepresented abnormalities.\n","authors":["Haoran Lai","Zihang Jiang","Qingsong Yao","Rongsheng Wang","Zhiyang He","Xiaodong Tao","Wei Wei","Weifu Lv","S. Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2501.03565v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03544v1","updated":"2025-01-07T05:39:21Z","published":"2025-01-07T05:39:21Z","title":"PromptGuard: Soft Prompt-Guided Unsafe Content Moderation for\n Text-to-Image Models","summary":" Text-to-image (T2I) models have been shown to be vulnerable to misuse,\nparticularly in generating not-safe-for-work (NSFW) content, raising serious\nethical concerns. In this work, we present PromptGuard, a novel content\nmoderation technique that draws inspiration from the system prompt mechanism in\nlarge language models (LLMs) for safety alignment. Unlike LLMs, T2I models lack\na direct interface for enforcing behavioral guidelines. Our key idea is to\noptimize a safety soft prompt that functions as an implicit system prompt\nwithin the T2I model's textual embedding space. This universal soft prompt (P*)\ndirectly moderates NSFW inputs, enabling safe yet realistic image generation\nwithout altering the inference efficiency or requiring proxy models. 
Extensive\nexperiments across three datasets demonstrate that PromptGuard effectively\nmitigates NSFW content generation while preserving high-quality benign outputs.\nPromptGuard is 7.8 times faster than prior content moderation methods,\nsurpassing eight state-of-the-art defenses with an optimal unsafe ratio down to\n5.84%.\n","authors":["Lingzhi Yuan","Xinfeng Li","Chejian Xu","Guanhong Tao","Xiaojun Jia","Yihao Huang","Wei Dong","Yang Liu","XiaoFeng Wang","Bo Li"],"pdf_url":"https://arxiv.org/pdf/2501.03544v1.pdf","comment":"16 pages, 8 figures, 10 tables"},{"id":"http://arxiv.org/abs/2501.03539v1","updated":"2025-01-07T05:21:13Z","published":"2025-01-07T05:21:13Z","title":"Enhanced Tuberculosis Bacilli Detection using Attention-Residual U-Net\n and Ensemble Classification","summary":" Tuberculosis (TB), caused by Mycobacterium tuberculosis, remains a critical\nglobal health issue, necessitating timely diagnosis and treatment. Current\nmethods for detecting tuberculosis bacilli from bright field microscopic sputum\nsmear images suffer from low automation, inadequate segmentation performance,\nand limited classification accuracy. This paper proposes an efficient hybrid\napproach that combines deep learning for segmentation and an ensemble model for\nclassification. An enhanced U-Net model incorporating attention blocks and\nresidual connections is introduced to precisely segment microscopic sputum\nsmear images, facilitating the extraction of Regions of Interest (ROIs). These\nROIs are subsequently classified using an ensemble classifier comprising\nSupport Vector Machine (SVM), Random Forest, and Extreme Gradient Boost\n(XGBoost), resulting in an accurate identification of bacilli within the\nimages. 
Experiments conducted on a newly created dataset, along with public\ndatasets, demonstrate that the proposed model achieves superior segmentation\nperformance, higher classification accuracy, and enhanced automation compared\nto existing methods.\n","authors":["Greeshma K","Vishnukumar S"],"pdf_url":"https://arxiv.org/pdf/2501.03539v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03538v1","updated":"2025-01-07T05:17:43Z","published":"2025-01-07T05:17:43Z","title":"Efficient and Accurate Tuberculosis Diagnosis: Attention Residual U-Net\n and Vision Transformer Based Detection Framework","summary":" Tuberculosis (TB), an infectious disease caused by Mycobacterium\ntuberculosis, continues to be a major global health threat despite being\npreventable and curable. This burden is particularly high in low and middle\nincome countries. Microscopy remains essential for diagnosing TB by enabling\ndirect visualization of Mycobacterium tuberculosis in sputum smear samples,\noffering a cost effective approach for early detection and effective treatment.\nGiven the labour-intensive nature of microscopy, automating the detection of\nbacilli in microscopic images is crucial to improve both the expediency and\nreliability of TB diagnosis. The current methodologies for detecting\ntuberculosis bacilli in bright field microscopic sputum smear images are\nhindered by limited automation capabilities, inconsistent segmentation quality,\nand constrained classification precision. This paper proposes a two-stage deep\nlearning methodology for tuberculosis bacilli detection, comprising bacilli\nsegmentation followed by classification. In the initial phase, an advanced\nU-Net model employing attention blocks and residual connections is proposed to\nsegment microscopic sputum smear images, enabling the extraction of Regions of\nInterest (ROIs). 
The extracted ROIs are then classified using a Vision\nTransformer, which we specifically customized as TBViT to enhance the precise\ndetection of bacilli within the images. For the experiments, a newly developed\ndataset of microscopic sputum smear images derived from Ziehl-Neelsen-stained\nslides is used in conjunction with existing public datasets. The qualitative\nand quantitative evaluation of the experiments using various metrics\ndemonstrates that the proposed model achieves significantly improved\nsegmentation performance, higher classification accuracy, and a greater level\nof automation, surpassing existing methods.\n","authors":["Greeshma K","Vishnukumar S"],"pdf_url":"https://arxiv.org/pdf/2501.03538v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03533v1","updated":"2025-01-07T05:12:49Z","published":"2025-01-07T05:12:49Z","title":"Anomaly Triplet-Net: Progress Recognition Model Using Deep Metric\n Learning Considering Occlusion for Manual Assembly Work","summary":" In this paper, a progress recognition method considering occlusion using deep\nmetric learning is proposed to visualize the product assembly process in a\nfactory. First, the target assembly product is detected from images acquired\nfrom a fixed-point camera installed in the factory using a deep learning-based\nobject detection method. 
Next, the detection area is cropped from the image.\nFinally, by using a classification method based on deep metric learning on the\ncropped image, the progress of the product assembly work is estimated as a\nrough progress step.\n As a specific progress estimation model, we propose an Anomaly Triplet-Net\nthat adds anomaly samples to Triplet Loss for progress estimation considering\nocclusion.\n In experiments, an 82.9% success rate is achieved for the progress estimation\nmethod using Anomaly Triplet-Net.\n We also experimented with the practicality of the sequence of detection,\ncropping, and progress estimation, and confirmed the effectiveness of the\noverall system.\n","authors":["Takumi Kitsukawa","Kazuma Miura","Shigeki Yumoto","Sarthak Pathak","Alessandro Moro","Kazunori Umeda"],"pdf_url":"https://arxiv.org/pdf/2501.03533v1.pdf","comment":"This paper has been peer-reviewed, revised, and published in Advanced\n Robotics"},{"id":"http://arxiv.org/abs/2501.03526v1","updated":"2025-01-07T04:42:45Z","published":"2025-01-07T04:42:45Z","title":"FgC2F-UDiff: Frequency-guided and Coarse-to-fine Unified Diffusion Model\n for Multi-modality Missing MRI Synthesis","summary":" Multi-modality magnetic resonance imaging (MRI) is essential for the\ndiagnosis and treatment of brain tumors. However, missing modalities are\ncommonly observed due to limitations in scan time, scan corruption, artifacts,\nmotion, and contrast agent intolerance. Synthesis of missing MRI has been a\nmeans to address the limitations of modality insufficiency in clinical practice\nand research. However, there are still some challenges, such as poor\ngeneralization, inaccurate non-linear mapping, and slow processing speeds. To\naddress the aforementioned issues, we propose a novel unified synthesis model,\nthe Frequency-guided and Coarse-to-fine Unified Diffusion Model (FgC2F-UDiff),\ndesigned for multiple inputs and outputs. 
Specifically, the Coarse-to-fine\nUnified Network (CUN) fully exploits the iterative denoising properties of\ndiffusion models, from global to detail, by dividing the denoising process into\ntwo stages, coarse and fine, to enhance the fidelity of synthesized images.\nSecondly, the Frequency-guided Collaborative Strategy (FCS) harnesses\nappropriate frequency information as prior knowledge to guide the learning of a\nunified, highly non-linear mapping. Thirdly, the Specific-acceleration Hybrid\nMechanism (SHM) integrates specific mechanisms to accelerate the diffusion\nmodel and enhance the feasibility of many-to-many synthesis. Extensive\nexperimental evaluations have demonstrated that our proposed FgC2F-UDiff model\nachieves superior performance on two datasets, validated through a\ncomprehensive assessment that includes both qualitative observations and\nquantitative metrics, such as PSNR, SSIM, LPIPS, and FID.\n","authors":["Xiaojiao Xiao","Qinmin Vivian Hu","Guanghui Wang"],"pdf_url":"https://arxiv.org/pdf/2501.03526v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03525v1","updated":"2025-01-07T04:40:55Z","published":"2025-01-07T04:40:55Z","title":"TexHOI: Reconstructing Textures of 3D Unknown Objects in Monocular\n Hand-Object Interaction Scenes","summary":" Reconstructing 3D models of dynamic, real-world objects with high-fidelity\ntextures from monocular frame sequences has been a challenging problem in\nrecent years. This difficulty stems from factors such as shadows, indirect\nillumination, and inaccurate object-pose estimations due to occluding\nhand-object interactions. To address these challenges, we propose a novel\napproach that predicts the hand's impact on environmental visibility and\nindirect illumination on the object's surface albedo. Our method first learns\nthe geometry and low-fidelity texture of the object, hand, and background\nthrough composite rendering of radiance fields. 
Simultaneously, we optimize the\nhand and object poses to achieve accurate object-pose estimations. We then\nrefine physics-based rendering parameters - including roughness, specularity,\nalbedo, hand visibility, skin color reflections, and environmental illumination\n- to produce precise albedo and accurate hand illumination and shadow regions.\nOur approach surpasses state-of-the-art methods in texture reconstruction and,\nto the best of our knowledge, is the first to account for hand-object\ninteractions in object texture reconstruction.\n","authors":["Alakh Aggarwal","Ningna Wang","Xiaohu Guo"],"pdf_url":"https://arxiv.org/pdf/2501.03525v1.pdf","comment":"This paper was accepted at ICCVM 2025 and will appear in the\n proceedings of IEEE TVCG as part of the conference"},{"id":"http://arxiv.org/abs/2310.15624v2","updated":"2025-01-07T04:39:25Z","published":"2023-10-24T08:45:15Z","title":"GUPNet++: Geometry Uncertainty Propagation Network for Monocular 3D\n Object Detection","summary":" Geometry plays a significant role in monocular 3D object detection. It can be\nused to estimate object depth by using the perspective projection between the\nobject's physical size and its 2D projection in the image plane, which can\nintroduce mathematical priors into deep models. However, this projection\nprocess also introduces error amplification, where the error of the estimated\nheight is amplified and reflected into the projected depth. It leads to\nunreliable depth inferences and also impairs training stability. To tackle this\nproblem, we propose a novel Geometry Uncertainty Propagation Network (GUPNet++)\nby modeling geometry projection in a probabilistic manner. This ensures depth\npredictions are well-bounded and associated with a reasonable uncertainty. The\nsignificance of introducing such geometric uncertainty is two-fold: (1). 
It\nmodels the uncertainty propagation relationship of the geometry projection\nduring training, improving the stability and efficiency of the end-to-end model\nlearning. (2). It can be derived into a highly reliable confidence to indicate\nthe quality of the 3D detection result, enabling more reliable detection\ninference. Experiments show that the proposed approach not only obtains\nstate-of-the-art (SOTA) performance in image-based monocular 3D detection but\nalso demonstrates superiority in efficacy with a simplified framework.\n","authors":["Yan Lu","Xinzhu Ma","Lei Yang","Tianzhu Zhang","Yating Liu","Qi Chu","Tong He","Yonghui Li","Wanli Ouyang"],"pdf_url":"https://arxiv.org/pdf/2310.15624v2.pdf","comment":"18 pages, 9 figures"},{"id":"http://arxiv.org/abs/2501.03510v1","updated":"2025-01-07T04:06:07Z","published":"2025-01-07T04:06:07Z","title":"Salient Region Matching for Fully Automated MR-TRUS Registration","summary":" Prostate cancer is a leading cause of cancer-related mortality in men. The\nregistration of magnetic resonance (MR) and transrectal ultrasound (TRUS) can\nprovide guidance for the targeted biopsy of prostate cancer. In this study, we\npropose a salient region matching framework for fully automated MR-TRUS\nregistration. The framework consists of prostate segmentation, rigid alignment\nand deformable registration. Prostate segmentation is performed using two\nsegmentation networks on MR and TRUS respectively, and the predicted salient\nregions are used for the rigid alignment. The rigidly-aligned MR and TRUS\nimages serve as initialization for the deformable registration. The deformable\nregistration network has a dual-stream encoder with cross-modal spatial\nattention modules to facilitate multi-modality feature learning, and a salient\nregion matching loss to consider both structure and intensity similarity within\nthe prostate region. 
Experiments on a public MR-TRUS dataset demonstrate that\nour method achieves satisfactory registration results, outperforming several\ncutting-edge methods. The code is publicly available at\nhttps://github.com/mock1ngbrd/salient-region-matching.\n","authors":["Zetian Feng","Dong Ni","Yi Wang"],"pdf_url":"https://arxiv.org/pdf/2501.03510v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16766v2","updated":"2025-01-07T03:53:12Z","published":"2024-05-27T02:27:28Z","title":"Concept Matching with Agent for Out-of-Distribution Detection","summary":" The remarkable achievements of Large Language Models (LLMs) have captivated\nthe attention of both academia and industry, transcending their initial role in\ndialogue generation. To expand the usage scenarios of LLM, some works enhance\nthe effectiveness and capabilities of the model by introducing more external\ninformation, which is called the agent paradigm. Based on this idea, we propose\na new method that integrates the agent paradigm into out-of-distribution (OOD)\ndetection task, aiming to improve its robustness and adaptability. Our proposed\nmethod, Concept Matching with Agent (CMA), employs neutral prompts as agents to\naugment the CLIP-based OOD detection process. These agents function as dynamic\nobservers and communication hubs, interacting with both In-distribution (ID)\nlabels and data inputs to form vector triangle relationships. This triangular\nframework offers a more nuanced approach than the traditional binary\nrelationship, allowing for better separation and identification of ID and OOD\ninputs. 
Our extensive experimental results showcase the superior performance of\nCMA over both zero-shot and training-required methods in a diverse array of\nreal-world scenarios.\n","authors":["Yuxiao Lee","Xiaofeng Cao","Jingcai Guo","Wei Ye","Qing Guo","Yi Chang"],"pdf_url":"https://arxiv.org/pdf/2405.16766v2.pdf","comment":"Accepted by AAAI-25"},{"id":"http://arxiv.org/abs/2501.03507v1","updated":"2025-01-07T03:50:11Z","published":"2025-01-07T03:50:11Z","title":"An Empirical Study of Accuracy-Robustness Tradeoff and Training\n Efficiency in Self-Supervised Learning","summary":" Self-supervised learning (SSL) has significantly advanced image\nrepresentation learning, yet efficiency challenges persist, particularly with\nadversarial training. Many SSL methods require extensive epochs to achieve\nconvergence, a demand further amplified in adversarial settings. To address\nthis inefficiency, we revisit the robust EMP-SSL framework, emphasizing the\nimportance of increasing the number of crops per image to accelerate learning.\nUnlike traditional contrastive learning, robust EMP-SSL leverages multi-crop\nsampling, integrates an invariance term and regularization, and reduces\ntraining epochs, enhancing time efficiency. Evaluated with both standard linear\nclassifiers and multi-patch embedding aggregation, robust EMP-SSL provides new\ninsights into SSL evaluation strategies.\n Our results show that robust crop-based EMP-SSL not only accelerates\nconvergence but also achieves a superior balance between clean accuracy and\nadversarial robustness, outperforming multi-crop embedding aggregation.\nAdditionally, we extend this approach with free adversarial training in\nMulti-Crop SSL, introducing the Cost-Free Adversarial Multi-Crop\nSelf-Supervised Learning (CF-AMC-SSL) method. CF-AMC-SSL demonstrates the\neffectiveness of free adversarial training in reducing training time while\nsimultaneously improving clean accuracy and adversarial robustness. 
These\nfindings underscore the potential of CF-AMC-SSL for practical SSL applications.\nOur code is publicly available at https://github.com/softsys4ai/CF-AMC-SSL.\n","authors":["Fatemeh Ghofrani","Pooyan Jamshidi"],"pdf_url":"https://arxiv.org/pdf/2501.03507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18710v3","updated":"2025-01-07T03:48:04Z","published":"2023-05-30T03:30:24Z","title":"High-Performance Inference Graph Convolutional Networks for\n Skeleton-Based Action Recognition","summary":" Recently, significant achievements have been made in skeleton-based human\naction recognition with the emergence of graph convolutional networks (GCNs).\nHowever, the state-of-the-art (SOTA) models used for this task focus on\nconstructing more complex higher-order connections between joint nodes to\ndescribe skeleton information, which leads to complex inference processes and\nhigh computational costs. To address the slow inference speed caused by overly\ncomplex model structures, we introduce re-parameterization and\nover-parameterization techniques to GCNs and propose two novel high-performance\ninference GCNs, namely HPI-GCN-RP and HPI-GCN-OP. After the completion of model\ntraining, model parameters are fixed. HPI-GCN-RP adopts a re-parameterization\ntechnique to transform the high-performance training model into a fast inference\nmodel through linear transformations, which achieves a higher inference speed\nwith competitive model performance. HPI-GCN-OP further utilizes an\nover-parameterization technique to achieve higher performance improvement by\nintroducing additional inference parameters, albeit with slightly decreased\ninference speed. The experimental results on the two skeleton-based action\nrecognition datasets demonstrate the effectiveness of our approach. Our\nHPI-GCN-OP achieves performance comparable to the current SOTA models, with\ninference speeds five times faster. 
Specifically, our HPI-GCN-OP achieves an\naccuracy of 93\\% on the cross-subject split of the NTU-RGB+D 60 dataset, and\n90.1\\% on the cross-subject benchmark of the NTU-RGB+D 120 dataset. Code is\navailable at github.com/lizaowo/HPI-GCN.\n","authors":["Junyi Wang","Ziao Li","Bangli Liu","Haibin Cai","Mohamad Saada","Qinggang Meng"],"pdf_url":"https://arxiv.org/pdf/2305.18710v3.pdf","comment":"23 pages, 5 figures"},{"id":"http://arxiv.org/abs/2310.17875v3","updated":"2025-01-07T03:43:11Z","published":"2023-10-27T03:32:05Z","title":"Siamese-DETR for Generic Multi-Object Tracking","summary":" The ability to detect and track the dynamic objects in different scenes is\nfundamental to real-world applications, e.g., autonomous driving and robot\nnavigation. However, traditional Multi-Object Tracking (MOT) is limited to\ntracking objects belonging to the pre-defined closed-set categories. Recently,\nOpen-Vocabulary MOT (OVMOT) and Generic MOT (GMOT) are proposed to track\ninterested objects beyond pre-defined categories with the given text prompt and\ntemplate image. However, the expensive well pre-trained (vision-)language model\nand fine-grained category annotations are required to train OVMOT models. In\nthis paper, we focus on GMOT and propose a simple but effective method,\nSiamese-DETR, for GMOT. Only the commonly used detection datasets (e.g., COCO)\nare required for training. Different from existing GMOT methods, which train a\nSingle Object Tracking (SOT) based detector to detect interested objects and\nthen apply a data association based MOT tracker to get the trajectories, we\nleverage the inherent object queries in DETR variants. 
Specifically: 1) The\nmulti-scale object queries are designed based on the given template image,\nwhich are effective for detecting different scales of objects with the same\ncategory as the template image; 2) A dynamic matching training strategy is\nintroduced to train Siamese-DETR on commonly used detection datasets, which\ntakes full advantage of provided annotations; 3) The online tracking pipeline\nis simplified through a tracking-by-query manner by incorporating the tracked\nboxes in the previous frame as additional query boxes. The complex data association\nis replaced with the much simpler Non-Maximum Suppression (NMS). Extensive\nexperimental results show that Siamese-DETR surpasses existing MOT methods on\nthe GMOT-40 dataset by a large margin. Codes are available at\n\\url{https://github.com/yumu-173/Siamese-DETR}.\n","authors":["Qiankun Liu","Yichen Li","Yuqi Jiang","Ying Fu"],"pdf_url":"https://arxiv.org/pdf/2310.17875v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03499v1","updated":"2025-01-07T03:39:43Z","published":"2025-01-07T03:39:43Z","title":"Can Deep Learning Trigger Alerts from Mobile-Captured Images?","summary":" Our research presents a comprehensive approach to leveraging mobile camera\nimage data for real-time air quality assessment and recommendation. We develop\na regression-based Convolutional Neural Network model and tailor it explicitly\nfor air quality prediction by exploiting the inherent relationship between\noutput parameters. As a result, the Mean Squared Error of 0.0077 and 0.0112\nobtained for 2 and 5 pollutants respectively outperforms existing models.\nFurthermore, we aim to verify the common practice of augmenting the original\ndataset with a view to introducing more variation in the training phase. It is\none of our most significant contributions that our experimental results\ndemonstrate minimal accuracy differences between the original and augmented\ndatasets. 
Finally, a real-time, user-friendly dashboard is implemented which\ndynamically displays the Air Quality Index and pollutant values derived from\ncaptured mobile camera images. Users' health conditions are considered to\nrecommend whether a location is suitable based on current air quality metrics.\nOverall, this research contributes to verification of data augmentation\ntechniques, CNN-based regression modelling for air quality prediction, and\nuser-centric air quality monitoring through mobile technology. The proposed\nsystem offers practical solutions for individuals to make informed\nenvironmental health and well-being decisions.\n","authors":["Pritisha Sarkar","Duranta Durbaar Vishal Saha","Mousumi Saha"],"pdf_url":"https://arxiv.org/pdf/2501.03499v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13656v2","updated":"2025-01-07T03:37:12Z","published":"2024-08-24T19:14:02Z","title":"Localize-and-Stitch: Efficient Model Merging via Sparse Task Arithmetic","summary":" Model merging offers an effective strategy to combine the strengths of\nmultiple finetuned models into a unified model that preserves the specialized\ncapabilities of each. Existing methods merge models in a global manner,\nperforming arithmetic operations across all model parameters. However, such\nglobal merging often leads to task interference, degrading the performance of\nthe merged model. In this work, we introduce Localize-and-Stitch, a novel\napproach that merges models in a localized way. Our algorithm works in two\nsteps: i) Localization: identify tiny ($1\\%$ of the total parameters) localized\nregions in the finetuned models containing essential skills for the downstream\ntasks, and ii) Stitching: reintegrate only these essential regions back into\nthe pretrained model for task synergy. 
We demonstrate that our approach\neffectively locates sparse regions responsible for finetuned performance, and\nthe localized regions could be treated as compact and interpretable\nrepresentations of the finetuned models (tasks). Empirically, we evaluate our\nmethod on various vision and language benchmarks, showing that it outperforms\nexisting model merging methods under different data availability scenarios.\nBeyond strong empirical performance, our algorithm also facilitates model\ncompression and preserves pretrained knowledge, enabling flexible and continual\nskill composition from multiple finetuned models with minimal storage and\ncomputational overhead. Our code is available at\nhttps://github.com/uiuctml/Localize-and-Stitch.\n","authors":["Yifei He","Yuzheng Hu","Yong Lin","Tong Zhang","Han Zhao"],"pdf_url":"https://arxiv.org/pdf/2408.13656v2.pdf","comment":"TMLR camera-ready version"},{"id":"http://arxiv.org/abs/2501.03495v1","updated":"2025-01-07T03:33:22Z","published":"2025-01-07T03:33:22Z","title":"Textualize Visual Prompt for Image Editing via Diffusion Bridge","summary":" Visual prompt, a pair of before-and-after edited images, can convey\nindescribable imagery transformations and prosper in image editing. However,\ncurrent visual prompt methods rely on a pretrained text-guided image-to-image\ngenerative model that requires a triplet of text, before, and after images for\nretraining over a text-to-image model. Such triplet crafting and retraining\nprocesses limit the scalability and generalization of editing. In this paper,\nwe present a framework based on any single text-to-image model without reliance\non the explicit image-to-image model, thus enhancing the generalizability and\nscalability. Specifically, by leveraging the probability-flow ordinary\ndifferential equation, we construct a diffusion bridge to transfer the distribution between\nbefore-and-after images under the text guidance. 
By optimizing the text via the\nbridge, the framework adaptively textualizes the editing transformation\nconveyed by visual prompts into text embeddings without other models.\nMeanwhile, we introduce differential attention control during text\noptimization, which disentangles the text embedding from the invariance of the\nbefore-and-after images and makes it solely capture the delicate transformation\nand generalize to edit various images. Experiments on real images validate\ncompetitive results on the generalization, contextual coherence, and high\nfidelity for delicate editing with just one image pair as the visual prompt.\n","authors":["Pengcheng Xu","Qingnan Fan","Fei Kou","Shuai Qin","Hong Gu","Ruoyu Zhao","Charles Ling","Boyu Wang"],"pdf_url":"https://arxiv.org/pdf/2501.03495v1.pdf","comment":"AAAI 2025"},{"id":"http://arxiv.org/abs/2501.02024v2","updated":"2025-01-07T03:29:43Z","published":"2025-01-02T20:47:04Z","title":"Model Checking in Medical Imaging for Tumor Detection and Segmentation","summary":" Recent advancements in model checking have demonstrated significant potential\nacross diverse applications, particularly in signal and image analysis. Medical\nimaging stands out as a critical domain where model checking can be effectively\napplied to design and evaluate robust frameworks. These frameworks facilitate\nautomatic and semi-automatic delineation of regions of interest within images,\naiding in accurate segmentation. 
This paper provides a comprehensive analysis\nof recent works leveraging spatial logic to develop operators and tools for\nidentifying regions of interest, including tumorous and non-tumorous areas.\nAdditionally, we examine the challenges inherent to spatial model-checking\ntechniques, such as variability in ground truth data and the need for\nstreamlined procedures suitable for routine clinical practice.\n","authors":["Elhoucine Elfatimi","Lahcen El fatimi"],"pdf_url":"https://arxiv.org/pdf/2501.02024v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.00166v2","updated":"2025-01-07T03:21:43Z","published":"2024-09-30T19:15:05Z","title":"EEG Emotion Copilot: Optimizing Lightweight LLMs for Emotional EEG\n Interpretation with Assisted Medical Record Generation","summary":" In the fields of affective computing (AC) and brain-machine interface (BMI),\nthe analysis of physiological and behavioral signals to discern individual\nemotional states has emerged as a critical research frontier. While deep\nlearning-based approaches have made notable strides in EEG emotion recognition,\nparticularly in feature extraction and pattern recognition, significant\nchallenges persist in achieving end-to-end emotion computation, including\nreal-time processing, individual adaptation, and seamless user interaction.\nThis paper presents the EEG Emotion Copilot, a system optimizing a lightweight\nlarge language model (LLM) with 0.5B parameters operating in a local setting,\nwhich first recognizes emotional states directly from EEG signals, subsequently\ngenerates personalized diagnostic and treatment suggestions, and finally\nsupports the automation of assisted electronic medical records. Specifically,\nwe demonstrate the critical techniques in the novel data structure of prompt,\nmodel pruning and fine-tuning training, and deployment strategies aiming at\nimproving real-time performance and computational efficiency. 
Extensive\nexperiments show that our optimized lightweight LLM-based copilot achieves an\nenhanced intuitive interface for participant interaction, superior accuracy of\nemotion recognition and assisted electronic medical records generation, in\ncomparison to such models with similar scale parameters or large-scale\nparameters such as 1.5B, 1.8B, 3B and 7B. In summary, through these efforts,\nthe proposed copilot is expected to advance the application of AC in the\nmedical domain, offering an innovative solution to mental health monitoring. The\ncodes will be released at https://github.com/NZWANG/EEG_Emotion_Copilot.\n","authors":["Hongyu Chen","Weiming Zeng","Chengcheng Chen","Luhui Cai","Fei Wang","Yuhu Shi","Lei Wang","Wei Zhang","Yueyang Li","Hongjie Yan","Wai Ting Siok","Nizhuan Wang"],"pdf_url":"https://arxiv.org/pdf/2410.00166v2.pdf","comment":"10 pages, 12 figures, 2 tables"},{"id":"http://arxiv.org/abs/2501.03490v1","updated":"2025-01-07T03:18:15Z","published":"2025-01-07T03:18:15Z","title":"SceneBooth: Diffusion-based Framework for Subject-preserved\n Text-to-Image Generation","summary":" Due to the demand for personalizing image generation, the subject-driven\ntext-to-image generation method, which creates novel renditions of an input\nsubject based on text prompts, has received growing research interest. Existing\nmethods often learn subject representation and incorporate it into the prompt\nembedding to guide image generation, but they struggle with preserving subject\nfidelity. To solve this issue, this paper proposes a novel framework named\nSceneBooth for subject-preserved text-to-image generation, which consumes\ninputs of a subject image, object phrases and text prompts. Instead of learning\nthe subject representation and generating a subject, our SceneBooth fixes the\ngiven subject image and generates its background image guided by the text\nprompts. 
To this end, our SceneBooth introduces two key components, i.e., a\nmultimodal layout generation module and a background painting module. The\nformer determines the position and scale of the subject by generating\nappropriate scene layouts that align with text captions, object phrases, and\nsubject visual information. The latter integrates two adapters (ControlNet and\nGated Self-Attention) into the latent diffusion model to generate a background\nthat harmonizes with the subject guided by scene layouts and text descriptions.\nIn this manner, our SceneBooth ensures accurate preservation of the subject's\nappearance in the output. Quantitative and qualitative experimental results\ndemonstrate that SceneBooth significantly outperforms baseline methods in terms\nof subject preservation, image harmonization and overall quality.\n","authors":["Shang Chai","Zihang Lin","Min Zhou","Xubin Li","Liansheng Zhuang","Houqiang Li"],"pdf_url":"https://arxiv.org/pdf/2501.03490v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.19391v2","updated":"2025-01-07T03:15:49Z","published":"2024-12-27T00:36:40Z","title":"An In-Depth Analysis of Adversarial Discriminative Domain Adaptation for\n Digit Classification","summary":" Domain adaptation is an active area of research driven by the growing demand\nfor robust machine learning models that perform well on real-world data.\nAdversarial learning for deep neural networks (DNNs) has emerged as a promising\napproach to improving generalization ability, particularly for image\nclassification. In this paper, we implement a specific adversarial learning\ntechnique known as Adversarial Discriminative Domain Adaptation (ADDA) and\nreplicate digit classification experiments from the original ADDA paper. We\nextend their findings by examining a broader range of domain shifts and provide\na detailed analysis of in-domain classification accuracy post-ADDA. 
Our results\ndemonstrate that ADDA significantly improves accuracy across certain domain\nshifts with minimal impact on in-domain performance. Furthermore, we provide\nqualitative analysis and propose potential explanations for ADDA's limitations\nin less successful domain shifts. Code is at\nhttps://github.com/eugenechoi2004/COS429_FINAL .\n","authors":["Eugene Choi","Julian Rodriguez","Edmund Young"],"pdf_url":"https://arxiv.org/pdf/2412.19391v2.pdf","comment":"Replacement: Updated methodology section to include grayscale\n preprocessing of SVHN data"},{"id":"http://arxiv.org/abs/2501.03482v1","updated":"2025-01-07T03:00:58Z","published":"2025-01-07T03:00:58Z","title":"VOILA: Complexity-Aware Universal Segmentation of CT images by Voxel\n Interacting with Language","summary":" Satisfactory progress has been achieved recently in universal segmentation of\nCT images. Following the success of vision-language methods, there is a growing\ntrend towards utilizing text prompts and contrastive learning to develop\nuniversal segmentation models. However, there exists a significant imbalance in\ninformation density between 3D images and text prompts. Moreover, the standard\nfully connected layer segmentation approach faces significant challenges in\nhandling multiple classes and exhibits poor generalizability. To address these\nchallenges, we propose the VOxel Interacting with LAnguage method (VOILA) for\nuniversal CT image segmentation. Initially, we align voxels and language into a\nshared representation space and classify voxels on the basis of cosine\nsimilarity. Subsequently, we develop the Voxel-Language Interaction framework\nto mitigate the impact of class imbalance caused by foreground-background\ndiscrepancies and variations in target volumes. Furthermore, a Complexity-Aware\nSampling method is proposed to focus on regions that are hard to segment, achieved by\ngenerating pseudo-heatmaps from a trainable Gaussian mixture distribution. 
Our\nresults indicate that the proposed VOILA is capable of achieving improved performance\nwith reduced parameters and computational cost during training. Furthermore, it\ndemonstrates significant generalizability across diverse datasets without\nadditional fine-tuning.\n","authors":["Zishuo Wan","Yu Gao","Wanyuan Pang","Dawei Ding"],"pdf_url":"https://arxiv.org/pdf/2501.03482v1.pdf","comment":"Accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2501.01691v2","updated":"2025-01-07T02:57:03Z","published":"2025-01-03T08:18:08Z","title":"VidFormer: A novel end-to-end framework fused by 3DCNN and Transformer\n for Video-based Remote Physiological Measurement","summary":" Remote physiological signal measurement based on facial videos, also known as\nremote photoplethysmography (rPPG), involves predicting changes in facial\nvascular blood flow from facial videos. While most deep learning-based methods\nhave achieved good results, they often struggle to balance performance across\nsmall and large-scale datasets due to the inherent limitations of convolutional\nneural networks (CNNs) and Transformer. In this paper, we introduce VidFormer,\na novel end-to-end framework that integrates 3-Dimension Convolutional Neural\nNetwork (3DCNN) and Transformer models for rPPG tasks. Initially, we conduct an\nanalysis of the traditional skin reflection model and subsequently introduce an\nenhanced model for the reconstruction of rPPG signals. Based on this improved\nmodel, VidFormer utilizes 3DCNN and Transformer to extract local and global\nfeatures from input data, respectively. To enhance the spatiotemporal feature\nextraction capabilities of VidFormer, we incorporate temporal-spatial attention\nmechanisms tailored for both 3DCNN and Transformer. Additionally, we design a\nmodule to facilitate information exchange and fusion between the 3DCNN and\nTransformer. 
Our evaluation on five publicly available datasets demonstrates\nthat VidFormer outperforms current state-of-the-art (SOTA) methods. Finally, we\ndiscuss the essential roles of each VidFormer module and examine the effects of\nethnicity, makeup, and exercise on its performance.\n","authors":["Jiachen Li","Shisheng Guo","Longzhen Tang","Cuolong Cui","Lingjiang Kong","Xiaobo Yang"],"pdf_url":"https://arxiv.org/pdf/2501.01691v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.02964v2","updated":"2025-01-07T02:55:15Z","published":"2025-01-06T12:16:56Z","title":"Socratic Questioning: Learn to Self-guide Multimodal Reasoning in the\n Wild","summary":" Complex visual reasoning remains a key challenge today. Typically, the\nchallenge is tackled using methodologies such as Chain of Thought (COT) and\nvisual instruction tuning. However, how to organically combine these two\nmethodologies for greater success remains unexplored. Also, issues like\nhallucinations and high training cost still need to be addressed. In this work,\nwe devise an innovative multi-round training and reasoning framework suitable\nfor lightweight Multimodal Large Language Models (MLLMs). Our self-questioning\napproach heuristically guides MLLMs to focus on visual clues relevant to the\ntarget problem, reducing hallucinations and enhancing the model's ability to\ndescribe fine-grained image details. This ultimately enables the model to\nperform well in complex visual reasoning and question-answering tasks. We have\nnamed this framework Socratic Questioning (SQ). To facilitate future research,\nwe create a multimodal mini-dataset named CapQA, which includes 1k images of\nfine-grained activities, for visual instruction tuning and evaluation. Our\nproposed SQ method leads to a 31.2% improvement in the hallucination score. Our\nextensive experiments on various benchmarks demonstrate SQ's remarkable\ncapabilities in heuristic self-questioning, zero-shot visual reasoning and\nhallucination mitigation. 
Our model and code will be publicly available.\n","authors":["Wanpeng Hu","Haodi Liu","Lin Chen","Feng Zhou","Changming Xiao","Qi Yang","Changshui Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.02964v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21534v6","updated":"2025-01-07T02:54:18Z","published":"2024-07-31T11:40:29Z","title":"ControlMLLM: Training-Free Visual Prompt Learning for Multimodal Large\n Language Models","summary":" In this work, we propose a training-free method to inject visual prompts into\nMultimodal Large Language Models (MLLMs) through test-time optimization of a\nlearnable latent variable. We observe that attention, as the core module of\nMLLMs, connects text prompt tokens and visual tokens, ultimately determining\nthe final results. Our approach involves adjusting visual tokens from the MLP\noutput at test time, controlling the attention response to ensure text prompt\ntokens attend to visual tokens in referring regions. We optimize a learnable\nlatent variable based on an energy function, enhancing the strength of\nreferring regions in the attention map. This enables detailed region\ndescription and reasoning without the need for substantial training costs or\nmodel retraining. Our method offers a promising direction for integrating\nreferring abilities into MLLMs, and supports referring with box, mask, scribble\nand point. 
The results demonstrate that our method exhibits out-of-domain\ngeneralization and interpretability.\n","authors":["Mingrui Wu","Xinyue Cai","Jiayi Ji","Jiale Li","Oucheng Huang","Gen Luo","Hao Fei","Guannan Jiang","Xiaoshuai Sun","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2407.21534v6.pdf","comment":"Accepted to NeurIPS 2024;\n Code:https://github.com/mrwu-mac/ControlMLLM"},{"id":"http://arxiv.org/abs/2501.02962v2","updated":"2025-01-07T02:51:31Z","published":"2025-01-06T12:09:08Z","title":"SceneVTG++: Controllable Multilingual Visual Text Generation in the Wild","summary":" Generating visual text in natural scene images is a challenging task with\nmany unsolved problems. Different from generating text on artificially designed\nimages (such as posters, covers, cartoons, etc.), the text in natural scene\nimages needs to meet the following four key criteria: (1) Fidelity: the\ngenerated text should appear as realistic as a photograph and be completely\naccurate, with no errors in any of the strokes. (2) Reasonability: the text\nshould be generated on reasonable carrier areas (such as boards, signs, walls,\netc.), and the generated text content should also be relevant to the scene. (3)\nUtility: the generated text can facilitate the training of natural scene OCR\n(Optical Character Recognition) tasks. (4) Controllability: the attributes of\nthe text (such as font and color) should be controllable as needed. In this\npaper, we propose a two-stage method, SceneVTG++, which simultaneously\nsatisfies the four aspects mentioned above. SceneVTG++ consists of a Text\nLayout and Content Generator (TLCG) and a Controllable Local Text Diffusion\n(CLTD). The former utilizes the world knowledge of multi-modal large language\nmodels to find reasonable text areas and recommend text content according to\nthe natural scene background images, while the latter generates controllable\nmultilingual text based on the diffusion model. 
Through extensive experiments,\nwe respectively verified the effectiveness of TLCG and CLTD, and demonstrated\nthe state-of-the-art text generation performance of SceneVTG++. In addition,\nthe generated images have superior utility in OCR tasks like text detection and\ntext recognition. Codes and datasets will be available.\n","authors":["Jiawei Liu","Yuanzhi Zhu","Feiyu Gao","Zhibo Yang","Peng Wang","Junyang Lin","Xinggang Wang","Wenyu Liu"],"pdf_url":"https://arxiv.org/pdf/2501.02962v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03471v1","updated":"2025-01-07T02:15:58Z","published":"2025-01-07T02:15:58Z","title":"Hyperbolic Binary Neural Network","summary":" Binary Neural Network (BNN) converts full-precision weights and activations\ninto their extreme 1-bit counterparts, making it particularly suitable for\ndeployment on lightweight mobile devices. While binary neural networks are\ntypically formulated as a constrained optimization problem and optimized in the\nbinarized space, general neural networks are formulated as an unconstrained\noptimization problem and optimized in the continuous space. This paper\nintroduces the Hyperbolic Binary Neural Network (HBNN) by leveraging the\nframework of hyperbolic geometry to optimize the constrained problem.\nSpecifically, we transform the constrained problem in hyperbolic space into an\nunconstrained one in Euclidean space using the Riemannian exponential map. On\nthe other hand, we also propose the Exponential Parametrization Cluster (EPC)\nmethod, which, compared to the Riemannian exponential map, shrinks the segment\ndomain based on a diffeomorphism. This approach increases the probability of\nweight flips, thereby maximizing the information gain in BNNs. 
Experimental\nresults on CIFAR10, CIFAR100, and ImageNet classification datasets with\nVGGsmall, ResNet18, and ResNet34 models illustrate the superior performance of\nour HBNN over state-of-the-art methods.\n","authors":["Jun Chen","Jingyang Xiang","Tianxin Huang","Xiangrui Zhao","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2501.03471v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03469v1","updated":"2025-01-07T02:10:52Z","published":"2025-01-07T02:10:52Z","title":"Information-Maximized Soft Variable Discretization for Self-Supervised\n Image Representation Learning","summary":" Self-supervised learning (SSL) has emerged as a crucial technique in image\nprocessing, encoding, and understanding, especially for developing today's\nvision foundation models that utilize large-scale datasets without annotations\nto enhance various downstream tasks. This study introduces a novel SSL\napproach, Information-Maximized Soft Variable Discretization (IMSVD), for image\nrepresentation learning. Specifically, IMSVD softly discretizes each variable\nin the latent space, enabling the estimation of their probability distributions\nover training batches and allowing the learning process to be directly guided\nby information measures. Motivated by the MultiView assumption, we propose an\ninformation-theoretic objective function to learn transform-invariant,\nnon-trivial, and redundancy-minimized representation features. We then derive a\njoint-cross entropy loss function for self-supervised image representation\nlearning, which theoretically enjoys superiority over the existing methods in\nreducing feature redundancy. Notably, our non-contrastive IMSVD method\nstatistically performs contrastive learning. Extensive experimental results\ndemonstrate the effectiveness of IMSVD on various downstream tasks in terms of\nboth accuracy and efficiency. Thanks to our variable discretization, the\nembedding features optimized by IMSVD offer unique explainability at the\nvariable level. 
IMSVD has the potential to be adapted to other learning\nparadigms. Our code is publicly available at\nhttps://github.com/niuchuangnn/IMSVD.\n","authors":["Chuang Niu","Wenjun Xia","Hongming Shan","Ge Wang"],"pdf_url":"https://arxiv.org/pdf/2501.03469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.01973v2","updated":"2025-01-07T02:10:45Z","published":"2024-12-28T02:28:19Z","title":"INFELM: In-depth Fairness Evaluation of Large Text-To-Image Models","summary":" The rapid development of large language models (LLMs) and large vision models\n(LVMs) has propelled the evolution of multi-modal AI systems, which have\ndemonstrated remarkable potential for industrial applications by emulating\nhuman-like cognition. However, they also pose significant ethical challenges,\nincluding amplifying harmful content and reinforcing societal biases. For\ninstance, biases in some industrial image generation models highlighted the\nurgent need for robust fairness assessments. Most existing evaluation\nframeworks focus on the comprehensiveness of various aspects of the models, but\nthey exhibit critical limitations, including insufficient attention to content\ngeneration alignment and social bias-sensitive domains. More importantly, their\nreliance on pixel-detection techniques is prone to inaccuracies.\n To address these issues, this paper presents INFELM, an in-depth fairness\nevaluation of widely-used text-to-image models. Our key contributions are: (1)\nan advanced skintone classifier incorporating facial topology and refined skin\npixel representation to enhance classification precision by at least 16.04%,\n(2) a bias-sensitive content alignment measurement for understanding societal\nimpacts, (3) a generalizable representation bias evaluation for diverse\ndemographic groups, and (4) extensive experiments analyzing large-scale\ntext-to-image model outputs across six social-bias-sensitive domains. 
We find\nthat existing models in the study generally do not meet the empirical fairness\ncriteria, and representation bias is generally more pronounced than alignment\nerrors. INFELM establishes a robust benchmark for fairness assessment,\nsupporting the development of multi-modal AI systems that align with ethical\nand human-centric principles.\n","authors":["Di Jin","Xing Liu","Yu Liu","Jia Qing Yap","Andrea Wong","Adriana Crespo","Qi Lin","Zhiyuan Yin","Qiang Yan","Ryan Ye"],"pdf_url":"https://arxiv.org/pdf/2501.01973v2.pdf","comment":"Di Jin and Xing Liu contributed equally to this work"},{"id":"http://arxiv.org/abs/2412.16487v2","updated":"2025-01-07T02:08:56Z","published":"2024-12-21T05:04:36Z","title":"Trusted Mamba Contrastive Network for Multi-View Clustering","summary":" Multi-view clustering can partition data samples into their categories by\nlearning a consensus representation in an unsupervised way and has received\nmore and more attention in recent years. However, there is an untrusted fusion\nproblem. The reasons for this problem are as follows: 1) The current methods\nignore the presence of noise or redundant information in the view; 2) The\nsimilarity of contrastive learning comes from the same sample rather than the\nsame cluster in deep multi-view clustering. This causes multi-view fusion to proceed in the\nwrong direction. This paper proposes a novel multi-view clustering network to\naddress this problem, termed the Trusted Mamba Contrastive Network (TMCN).\nSpecifically, we present a new Trusted Mamba Fusion Network (TMFN), which\nachieves a trusted fusion of multi-view data through a selective mechanism.\nMoreover, we align the fused representation and the view-specific\nrepresentation using the Average-similarity Contrastive Learning (AsCL) module.\nAsCL increases the similarity of view representations from the same cluster, not\nmerely from the same sample. 
Extensive experiments show that the proposed\nmethod achieves state-of-the-art results in deep multi-view clustering tasks.\nThe source code is available at https://github.com/HackerHyper/TMCN.\n","authors":["Jian Zhu","Xin Zou","Lei Liu","Zhangmin Huang","Ying Zhang","Chang Tang","Li-Rong Dai"],"pdf_url":"https://arxiv.org/pdf/2412.16487v2.pdf","comment":"accepted by 2025 IEEE International Conference on Acoustics, Speech,\n and Signal Processing (ICASSP 2025)"},{"id":"http://arxiv.org/abs/2412.19139v2","updated":"2025-01-07T01:50:11Z","published":"2024-12-26T09:51:05Z","title":"PlanLLM: Video Procedure Planning with Refinable Large Language Models","summary":" Video procedure planning, i.e., planning a sequence of action steps given the\nvideo frames of start and goal states, is an essential ability for embodied AI.\nRecent works utilize Large Language Models (LLMs) to generate enriched action\nstep description texts to guide action step decoding. Although LLMs are\nintroduced, these methods decode the action steps into a closed set of one-hot\nvectors, limiting the model's capability of generalizing to new steps or tasks.\nAdditionally, fixed action step descriptions based on world-level commonsense\nmay contain noise in specific instances of visual states. In this paper, we\npropose PlanLLM, a cross-modal joint learning framework with LLMs for video\nprocedure planning. We propose an LLM-Enhanced Planning module which fully uses\nthe generalization ability of LLMs to produce free-form planning output and to\nenhance action step decoding. We also propose a Mutual Information Maximization\nmodule to connect world-level commonsense of step descriptions and\nsample-specific information of visual states, enabling LLMs to employ the\nreasoning ability to generate step sequences. With the assistance of LLMs, our\nmethod can handle both closed-set and open-vocabulary procedure planning tasks. 
Our\nPlanLLM achieves superior performance on three benchmarks, demonstrating the\neffectiveness of our designs.\n","authors":["Dejie Yang","Zijing Zhao","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2412.19139v2.pdf","comment":"accepted to AAAI2025"},{"id":"http://arxiv.org/abs/2501.03466v1","updated":"2025-01-07T01:47:57Z","published":"2025-01-07T01:47:57Z","title":"DGSSA: Domain generalization with structural and stylistic augmentation\n for retinal vessel segmentation","summary":" Retinal vascular morphology is crucial for diagnosing diseases such as\ndiabetes, glaucoma, and hypertension, making accurate segmentation of retinal\nvessels essential for early intervention. Traditional segmentation methods\nassume that training and testing data share similar distributions, which can\nlead to poor performance on unseen domains due to domain shifts caused by\nvariations in imaging devices and patient demographics. This paper presents a\nnovel approach, DGSSA, for retinal vessel image segmentation that enhances\nmodel generalization by combining structural and style augmentation strategies.\nWe utilize a space colonization algorithm to generate diverse vascular-like\nstructures that closely mimic actual retinal vessels, which are then used to\ngenerate pseudo-retinal images with an improved Pix2Pix model, allowing the\nsegmentation model to learn a broader range of structure distributions.\nAdditionally, we utilize PixMix to implement random photometric augmentations\nand introduce uncertainty perturbations, thereby enriching stylistic diversity\nand significantly enhancing the model's adaptability to varying imaging\nconditions. Our framework has been rigorously evaluated on four challenging\ndatasets (DRIVE, CHASEDB, HRF, and STARE), demonstrating state-of-the-art\nperformance that surpasses existing methods. 
This validates the effectiveness\nof our proposed approach, highlighting its potential for clinical application\nin automated retinal vessel analysis.\n","authors":["Bo Liu","Yudong Zhang","Shuihua Wang","Siyue Li","Jin Hong"],"pdf_url":"https://arxiv.org/pdf/2501.03466v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22376v2","updated":"2025-01-07T01:41:13Z","published":"2024-10-29T07:43:39Z","title":"Rare-to-Frequent: Unlocking Compositional Generation Power of Diffusion\n Models on Rare Concepts with LLM Guidance","summary":" State-of-the-art text-to-image (T2I) diffusion models often struggle to\ngenerate rare compositions of concepts, e.g., objects with unusual attributes.\nIn this paper, we show that the compositional generation power of diffusion\nmodels on such rare concepts can be significantly enhanced by Large\nLanguage Model (LLM) guidance. We start with empirical and theoretical\nanalysis, demonstrating that exposing frequent concepts relevant to the target\nrare concepts during the diffusion sampling process yields more accurate\nconcept composition. Based on this, we propose a training-free approach, R2F,\nthat plans and executes the overall rare-to-frequent concept guidance\nthroughout the diffusion inference by leveraging the abundant semantic\nknowledge in LLMs. Our framework is flexible across any pre-trained diffusion\nmodels and LLMs, and can be seamlessly integrated with the region-guided\ndiffusion approaches. In extensive experiments on three datasets, including our\nnewly proposed benchmark RareBench, which contains various prompts with rare\ncompositions of concepts, R2F significantly surpasses existing models including\nSD3.0 and FLUX by up to 28.1%p in T2I alignment. 
Code is available at\nhttps://github.com/krafton-ai/Rare-to-Frequent.\n","authors":["Dongmin Park","Sebin Kim","Taehong Moon","Minkyu Kim","Kangwook Lee","Jaewoong Cho"],"pdf_url":"https://arxiv.org/pdf/2410.22376v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09369v2","updated":"2025-01-07T01:23:54Z","published":"2024-08-18T05:47:33Z","title":"Flemme: A Flexible and Modular Learning Platform for Medical Images","summary":" With the rapid development of computer vision and the emergence of powerful\nnetwork backbones and architectures, the application of deep learning in\nmedical imaging has become increasingly significant. Unlike natural images,\nmedical images lack huge volumes of data but feature more modalities, making it\ndifficult to train a general model that has satisfactory performance across\nvarious datasets. In practice, practitioners often resort to manually\ncreating and testing models combining independent backbones and architectures,\nwhich is a laborious and time-consuming process. We propose Flemme, a FLExible\nand Modular learning platform for MEdical images. Our platform separates\nencoders from the model architectures so that different models can be\nconstructed via various combinations of supported encoders and architectures.\nWe construct encoders using building blocks based on convolution, transformer,\nand state-space model (SSM) to process both 2D and 3D image patches. A base\narchitecture is implemented following an encoder-decoder style, with several\nderived architectures for image segmentation, reconstruction, and generation\ntasks. 
In addition, we propose a general hierarchical architecture\nincorporating a pyramid loss to optimize and fuse vertical features.\nExperiments demonstrate that this simple design leads to an average improvement\nof 5.60% in Dice score and 7.81% in mean intersection over union (mIoU) for\nsegmentation models, as well as an enhancement of 5.57% in peak signal-to-noise\nratio (PSNR) and 8.22% in structural similarity (SSIM) for reconstruction\nmodels. We further utilize Flemme as an analytical tool to assess the\neffectiveness and efficiency of various encoders across different tasks. Code\nis available at https://github.com/wlsdzyzl/flemme.\n","authors":["Guoqing Zhang","Jingyun Yang","Yang Li"],"pdf_url":"https://arxiv.org/pdf/2408.09369v2.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2501.03458v1","updated":"2025-01-07T01:19:48Z","published":"2025-01-07T01:19:48Z","title":"Activating Associative Disease-Aware Vision Token Memory for LLM-Based\n X-ray Report Generation","summary":" X-ray image based medical report generation has achieved significant progress in\nrecent years with the help of large language models; however, these models\nhave not fully exploited the effective information in visual image regions,\nresulting in reports that are linguistically sound but insufficient in\ndescribing key diseases. In this paper, we propose a novel associative\nmemory-enhanced X-ray report generation model that effectively mimics the\nprocess of professional doctors writing medical reports. It considers both the\nmining of global and local visual information and associates historical report\ninformation to better complete the writing of the current report. Specifically,\ngiven an X-ray image, we first utilize a classification model along with its\nactivation maps to accomplish the mining of visual regions highly associated\nwith diseases and the learning of disease query tokens. 
Then, we employ a\nvisual Hopfield network to establish memory associations for disease-related\ntokens, and a report Hopfield network to retrieve report memory information.\nThis process facilitates the generation of high-quality reports based on a\nlarge language model and achieves state-of-the-art performance on multiple\nbenchmark datasets, including the IU X-ray, MIMIC-CXR, and Chexpert Plus. The\nsource code of this work is released on\n\\url{https://github.com/Event-AHU/Medical_Image_Analysis}.\n","authors":["Xiao Wang","Fuling Wang","Haowen Wang","Bo Jiang","Chuanfu Li","Yaowei Wang","Yonghong Tian","Jin Tang"],"pdf_url":"https://arxiv.org/pdf/2501.03458v1.pdf","comment":"In Peer Review"},{"id":"http://arxiv.org/abs/2501.04184v1","updated":"2025-01-07T23:32:05Z","published":"2025-01-07T23:32:05Z","title":"MedicalNarratives: Connecting Medical Vision and Language with Localized\n Narratives","summary":" We propose MedicalNarratives, a dataset curated from medical pedagogical\nvideos similar in nature to data collected in Think-Aloud studies and inspired\nby Localized Narratives, which collects grounded image-text data by curating\ninstructors' speech and mouse cursor movements synchronized in time.\nMedicalNarratives enables pretraining of both semantic and dense objectives,\nalleviating the need to train medical semantic and dense tasks disparately due\nto the lack of reasonably sized datasets. Our dataset contains 4.7M image-text\npairs from videos and articles, with 1M samples containing dense annotations in\nthe form of traces and bounding boxes. 
To evaluate the utility of\nMedicalNarratives, we train GenMedClip based on the CLIP architecture using our\ndataset spanning 12 medical domains and demonstrate that it outperforms\nprevious state-of-the-art models on a newly constructed medical imaging\nbenchmark that comprehensively evaluates performance across all modalities.\nData, demo, code and models available at https://medical-narratives.github.io\n","authors":["Wisdom O. Ikezogwo","Kevin Zhang","Mehmet Saygin Seyfioglu","Fatemeh Ghezloo","Linda Shapiro","Ranjay Krishna"],"pdf_url":"https://arxiv.org/pdf/2501.04184v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02716v2","updated":"2025-01-07T23:12:33Z","published":"2024-07-02T23:48:43Z","title":"Light-weight Fine-tuning Method for Defending Adversarial Noise in\n Pre-trained Medical Vision-Language Models","summary":" Fine-tuning pre-trained Vision-Language Models (VLMs) has shown remarkable\ncapabilities in medical image and textual depiction synergy. Nevertheless, many\npre-training datasets are restricted by patient privacy concerns, potentially\ncontaining noise that can adversely affect downstream performance. Moreover,\nthe growing reliance on multi-modal generation exacerbates this issue because\nof its susceptibility to adversarial attacks. To investigate how VLMs trained\non adversarial noisy data perform on downstream medical tasks, we first craft\nnoisy upstream datasets using multi-modal adversarial attacks. Through our\ncomprehensive analysis, we unveil that moderate noise enhances model robustness\nand transferability, but increasing noise levels negatively impact downstream\ntask performance. 
To mitigate this issue, we propose the rectify adversarial noise\n(RAN) framework, a recipe designed to effectively defend against adversarial attacks\nand rectify the influence of upstream noise during fine-tuning.\n","authors":["Xu Han","Linghao Jin","Xuezhe Ma","Xiaofeng Liu"],"pdf_url":"https://arxiv.org/pdf/2407.02716v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.07320v2","updated":"2025-01-07T23:10:25Z","published":"2024-12-10T09:08:41Z","title":"CoMA: Compositional Human Motion Generation with Multi-modal Agents","summary":" 3D human motion generation has seen substantial advancement in recent years.\nWhile state-of-the-art approaches have improved performance significantly, they\nstill struggle with complex and detailed motions unseen in training data,\nlargely due to the scarcity of motion datasets and the prohibitive cost of\ngenerating new training examples. To address these challenges, we introduce\nCoMA, an agent-based solution for complex human motion generation, editing, and\ncomprehension. CoMA leverages multiple collaborative agents powered by large\nlanguage and vision models, alongside a mask transformer-based motion generator\nfeaturing body part-specific encoders and codebooks for fine-grained control.\nOur framework enables generation of both short and long motion sequences with\ndetailed instructions, text-guided motion editing, and self-correction for\nimproved quality. Evaluations on the HumanML3D dataset demonstrate competitive\nperformance against state-of-the-art methods. 
Additionally, we create a set of\ncontext-rich, compositional, and long text prompts, where user studies show our\nmethod significantly outperforms existing approaches.\n","authors":["Shanlin Sun","Gabriel De Araujo","Jiaqi Xu","Shenghan Zhou","Hanwen Zhang","Ziheng Huang","Chenyu You","Xiaohui Xie"],"pdf_url":"https://arxiv.org/pdf/2412.07320v2.pdf","comment":"Project Page: https://gabrie-l.github.io/coma-page/"},{"id":"http://arxiv.org/abs/2406.14847v2","updated":"2025-01-07T23:01:21Z","published":"2024-06-21T03:23:37Z","title":"Fair Text to Medical Image Diffusion Model with Subgroup Distribution\n Aligned Tuning","summary":" The text to medical image (T2MedI) with latent diffusion model has great\npotential to alleviate the scarcity of medical imaging data and explore the\nunderlying appearance distribution of lesions in a specific patient status\ndescription. However, as with text-to-natural-image models, we show that the\nT2MedI model can also be biased toward some subgroups and overlook the minority ones in\nthe training set. In this work, we first build a T2MedI model based on the\npre-trained Imagen model, which has the fixed contrastive language-image\npre-training (CLIP) text encoder, while its decoder has been fine-tuned on\nmedical images from the Radiology Objects in COntext (ROCO) dataset. Its gender\nbias is analyzed qualitatively and quantitatively. To address this issue, we\npropose to fine-tune the T2MedI toward the target application dataset to align\ntheir sensitive subgroup distribution probabilities. Specifically, the alignment\nloss for fine-tuning is guided by an off-the-shelf sensitivity-subgroup\nclassifier to match the classification probability between the generated images\nand the expected target dataset. In addition, the image quality is maintained\nby a CLIP-consistency regularization term following a knowledge distillation\nscheme. 
For evaluation, we set the target dataset to be enhanced as the BraTS18\ndataset, and trained a brain magnetic resonance (MR) slice-based gender\nclassifier from it. With our method, the generated MR images can markedly reduce\nthe inconsistency with the gender proportion in the BraTS18 dataset.\n","authors":["Xu Han","Fangfang Fan","Jingzhao Rong","Zhen Li","Georges El Fakhri","Qingyu Chen","Xiaofeng Liu"],"pdf_url":"https://arxiv.org/pdf/2406.14847v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.03819v2","updated":"2025-01-07T23:00:02Z","published":"2022-08-07T21:06:42Z","title":"Cross-Skeleton Interaction Graph Aggregation Network for Representation\n Learning of Mouse Social Behaviour","summary":" Automated social behaviour analysis of mice has become an increasingly\npopular research area in behavioural neuroscience. Recently, pose information\n(i.e., locations of keypoints or skeleton) has been used to interpret social\nbehaviours of mice. Nevertheless, effective encoding and decoding of social\ninteraction information underlying the keypoints of mice has been rarely\ninvestigated in the existing methods. In particular, it is challenging to model\ncomplex social interactions between mice due to highly deformable body shapes\nand ambiguous movement patterns. To deal with the interaction modelling\nproblem, we here propose a Cross-Skeleton Interaction Graph Aggregation Network\n(CS-IGANet) to learn abundant dynamics of freely interacting mice, where a\nCross-Skeleton Node-level Interaction module (CS-NLI) is used to model\nmulti-level interactions (i.e., intra-, inter- and cross-skeleton\ninteractions). Furthermore, we design a novel Interaction-Aware Transformer\n(IAT) to dynamically learn the graph-level representation of social behaviours\nand update the node-level representation, guided by our proposed\ninteraction-aware self-attention mechanism. 
Finally, to enhance the\nrepresentation ability of our model, an auxiliary self-supervised learning task\nis proposed for measuring the similarity between cross-skeleton nodes.\nExperimental results on the standard CRMI13-Skeleton and our PDMB-Skeleton\ndatasets show that our proposed model outperforms several other\nstate-of-the-art approaches.\n","authors":["Feixiang Zhou","Xinyu Yang","Fang Chen","Long Chen","Zheheng Jiang","Hui Zhu","Reiko Heckel","Haikuan Wang","Minrui Fei","Huiyu Zhou"],"pdf_url":"https://arxiv.org/pdf/2208.03819v2.pdf","comment":"Accepted to IEEE Transactions on Image Processing"},{"id":"http://arxiv.org/abs/2501.04172v1","updated":"2025-01-07T22:51:10Z","published":"2025-01-07T22:51:10Z","title":"Machine Learning for Identifying Grain Boundaries in Scanning Electron\n Microscopy (SEM) Images of Nanoparticle Superlattices","summary":" Nanoparticle superlattices consisting of ordered arrangements of\nnanoparticles exhibit unique optical, magnetic, and electronic properties\narising from nanoparticle characteristics as well as their collective\nbehaviors. Understanding how processing conditions influence the nanoscale\narrangement and microstructure is critical for engineering materials with\ndesired macroscopic properties. Microstructural features such as grain\nboundaries, lattice defects, and pores significantly affect these properties\nbut are challenging to quantify using traditional manual analyses as they are\nlabor-intensive and prone to errors. In this work, we present a machine\nlearning workflow for automating grain segmentation in scanning electron\nmicroscopy (SEM) images of nanoparticle superlattices. This workflow integrates\nsignal processing techniques, such as Radon transforms, with unsupervised\nlearning methods like agglomerative hierarchical clustering to identify and\nsegment grains without requiring manually annotated data. 
In the workflow, we\ntransform the raw pixel data into an explainable numerical representation of\nsuperlattice orientations for clustering. Benchmarking results demonstrate the\nworkflow's robustness against noisy images and edge cases, with a processing\nspeed of four images per minute on standard computational hardware. This\nefficiency makes the workflow scalable to large datasets and makes it a\nvaluable tool for integrating data-driven models into decision-making processes\nfor material design and analysis. For example, one can use this workflow to\nquantify grain size distributions at varying processing conditions like\ntemperature and pressure and, using that knowledge, adjust processing conditions\nto achieve desired superlattice orientations and grain sizes.\n","authors":["Aanish Paruchuri","Carl Thrasher","A. J. Hart","Robert Macfarlane","Arthi Jayaraman"],"pdf_url":"https://arxiv.org/pdf/2501.04172v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.13190v3","updated":"2025-01-07T22:06:07Z","published":"2024-12-17T18:59:33Z","title":"MotionBridge: Dynamic Video Inbetweening with Flexible Controls","summary":" By generating plausible and smooth transitions between two image frames,\nvideo inbetweening is an essential tool for video editing and long video\nsynthesis. Traditional works lack the capability to generate complex large\nmotions. While recent video generation techniques are powerful in creating\nhigh-quality results, they often lack fine control over the details of\nintermediate frames, which can lead to results that do not align with the\ncreative intent. We introduce MotionBridge, a unified video inbetweening\nframework that allows flexible controls, including trajectory strokes,\nkeyframes, masks, guide pixels, and text. However, learning such multi-modal\ncontrols in a unified framework is a challenging task. 
We thus design two\ngenerators to extract the control signal faithfully and encode feature through\ndual-branch embedders to resolve ambiguities. We further introduce a curriculum\ntraining strategy to smoothly learn various controls. Extensive qualitative and\nquantitative experiments have demonstrated that such multi-modal controls\nenable a more dynamic, customizable, and contextually accurate visual\nnarrative.\n","authors":["Maham Tanveer","Yang Zhou","Simon Niklaus","Ali Mahdavi Amiri","Hao Zhang","Krishna Kumar Singh","Nanxuan Zhao"],"pdf_url":"https://arxiv.org/pdf/2412.13190v3.pdf","comment":"Project website: [https://motionbridge.github.io/]"},{"id":"http://arxiv.org/abs/2501.04155v1","updated":"2025-01-07T21:55:56Z","published":"2025-01-07T21:55:56Z","title":"MM-GEN: Enhancing Task Performance Through Targeted Multimodal Data\n Curation","summary":" Vision-language models (VLMs) are highly effective but often underperform on\nspecialized tasks; for example, Llava-1.5 struggles with chart and diagram\nunderstanding due to scarce task-specific training data. Existing training\ndata, sourced from general-purpose datasets, fails to capture the nuanced\ndetails needed for these tasks. We introduce MM-Gen, a scalable method that\ngenerates task-specific, high-quality synthetic text for candidate images by\nleveraging stronger models. MM-Gen employs a three-stage targeted process:\npartitioning data into subgroups, generating targeted text based on task\ndescriptions, and filtering out redundant and outlier data. Fine-tuning VLMs\nwith data generated by MM-Gen leads to significant performance gains, including\n29% on spatial reasoning and 15% on diagram understanding for Llava-1.5 (7B).\nCompared to human-curated caption data, MM-Gen achieves up to 1.6x better\nimprovements for the original models, proving its effectiveness in enhancing\ntask-specific VLM performance and bridging the gap between general-purpose\ndatasets and specialized requirements. 
Code available at\nhttps://github.com/sjoshi804/MM-Gen.\n","authors":["Siddharth Joshi","Besmira Nushi","Vidhisha Balachandran","Varun Chandrasekaran","Vibhav Vineet","Neel Joshi","Baharan Mirzasoleiman"],"pdf_url":"https://arxiv.org/pdf/2501.04155v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01054v3","updated":"2025-01-07T21:53:44Z","published":"2024-02-01T22:58:21Z","title":"Unconditional Latent Diffusion Models Memorize Patient Imaging Data:\n Implications for Openly Sharing Synthetic Data","summary":" AI models present a wide range of applications in the field of medicine.\nHowever, achieving optimal performance requires access to extensive healthcare\ndata, which is often not readily available. Furthermore, the imperative to\npreserve patient privacy restricts patient data sharing with third parties and\neven within institutes. Recently, generative AI models have been gaining\ntraction for facilitating open-data sharing by proposing synthetic data as\nsurrogates of real patient data. Despite the promise, some of these models are\nsusceptible to patient data memorization, where models generate patient data\ncopies instead of novel synthetic samples. Considering the importance of the\nproblem, surprisingly it has received relatively little attention in the\nmedical imaging community. To this end, we assess memorization in unconditional\nlatent diffusion models. We train latent diffusion models on CT, MR, and X-ray\ndatasets for synthetic data generation. We then detect the amount of training\ndata memorized utilizing our novel self-supervised copy detection approach and\nfurther investigate various factors that can influence memorization. Our\nfindings show a surprisingly high degree of patient data memorization across\nall datasets. 
Comparison with non-diffusion generative models, such as\nautoencoders and generative adversarial networks, indicates that while latent\ndiffusion models are more susceptible to memorization, overall they outperform\nnon-diffusion models in synthesis quality. Further analyses reveal that using\naugmentation strategies, smaller architectures, and larger datasets can reduce\nmemorization, while over-training the models can increase it. Collectively, our\nresults emphasize the importance of carefully training generative models on\nprivate medical imaging datasets, and examining the synthetic data to ensure\npatient privacy before sharing it for medical research and applications.\n","authors":["Salman Ul Hassan Dar","Marvin Seyfarth","Isabelle Ayx","Theano Papavassiliu","Stefan O. Schoenberg","Robert Malte Siepmann","Fabian Christopher Laqua","Jannik Kahmann","Norbert Frey","Bettina Baeßler","Sebastian Foersch","Daniel Truhn","Jakob Nikolas Kather","Sandy Engelhardt"],"pdf_url":"https://arxiv.org/pdf/2402.01054v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18038v2","updated":"2025-01-07T21:41:28Z","published":"2024-03-26T18:49:56Z","title":"TGGLinesPlus: A robust topological graph-guided computer vision\n algorithm for line detection from images","summary":" Line detection is a classic and essential problem in image processing,\ncomputer vision and machine intelligence. Line detection has many important\napplications, including image vectorization (e.g., document recognition and art\ndesign), indoor mapping, and important societal challenges (e.g., sea ice\nfracture line extraction from satellite imagery). Many line detection\nalgorithms and methods have been developed, but robust and intuitive methods\nare still lacking. In this paper, we proposed and implemented a topological\ngraph-guided algorithm, named TGGLinesPlus, for line detection. Our experiments\non images from a wide range of domains have demonstrated the flexibility of our\nTGGLinesPlus algorithm.
We benchmarked our algorithm with five classic and\nstate-of-the-art line detection methods and evaluated the benchmark results\nqualitatively and quantitatively; the results demonstrate the robustness of\nTGGLinesPlus.\n","authors":["Liping Yang","Joshua Driscol","Ming Gong","Katie Slack","Wenbin Zhang","Shujie Wang","Catherine G. Potts"],"pdf_url":"https://arxiv.org/pdf/2403.18038v2.pdf","comment":"Our TGGLinesPlus Python implementation is open-sourced. 29 pages, 8\n figures and 4 tables"},{"id":"http://arxiv.org/abs/2501.04144v1","updated":"2025-01-07T21:14:11Z","published":"2025-01-07T21:14:11Z","title":"Chirpy3D: Continuous Part Latents for Creative 3D Bird Generation","summary":" In this paper, we push the boundaries of fine-grained 3D generation into\ntruly creative territory. Current methods either lack intricate details or\nsimply mimic existing objects -- we enable both. By lifting 2D fine-grained\nunderstanding into 3D through multi-view diffusion and modeling part latents as\ncontinuous distributions, we unlock the ability to generate entirely new, yet\nplausible parts through interpolation and sampling. A self-supervised feature\nconsistency loss further ensures stable generation of these unseen parts. The\nresult is the first system capable of creating novel 3D objects with\nspecies-specific details that transcend existing examples. While we demonstrate\nour approach on birds, the underlying framework extends beyond things that can\nchirp!
Code will be released at https://github.com/kamwoh/chirpy3d.\n","authors":["Kam Woh Ng","Jing Yang","Jia Wei Sii","Jiankang Deng","Chee Seng Chan","Yi-Zhe Song","Tao Xiang","Xiatian Zhu"],"pdf_url":"https://arxiv.org/pdf/2501.04144v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2308.05764v2","updated":"2025-01-07T20:50:51Z","published":"2023-08-09T10:05:11Z","title":"Unlocking the diagnostic potential of electrocardiograms through\n information transfer from cardiac magnetic resonance imaging","summary":" Cardiovascular diseases (CVD) can be diagnosed using various diagnostic\nmodalities. The electrocardiogram (ECG) is a cost-effective and widely\navailable diagnostic aid that provides functional information of the heart.\nHowever, its ability to classify and spatially localise CVD is limited. In\ncontrast, cardiac magnetic resonance (CMR) imaging provides detailed structural\ninformation of the heart and thus enables evidence-based diagnosis of CVD, but\nlong scan times and high costs limit its use in clinical routine. In this work,\nwe present a deep learning strategy for cost-effective and comprehensive\ncardiac screening solely from ECG. Our approach combines multimodal contrastive\nlearning with masked data modelling to transfer domain-specific information\nfrom CMR imaging to ECG representations. In extensive experiments using data\nfrom 40,044 UK Biobank subjects, we demonstrate the utility and\ngeneralisability of our method for subject-specific risk prediction of CVD and\nthe prediction of cardiac phenotypes using only ECG data. Specifically, our\nnovel multimodal pre-training paradigm improves performance by up to 12.19 %\nfor risk prediction and 27.59 % for phenotype prediction. In a qualitative\nanalysis, we demonstrate that our learned ECG representations incorporate\ninformation from CMR image regions of interest. 
Our entire pipeline is publicly\navailable at https://github.com/oetu/MMCL-ECG-CMR.\n","authors":["Özgün Turgut","Philip Müller","Paul Hager","Suprosanna Shit","Sophie Starck","Martin J. Menten","Eimo Martens","Daniel Rueckert"],"pdf_url":"https://arxiv.org/pdf/2308.05764v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2108.07554v3","updated":"2025-01-07T20:29:48Z","published":"2021-08-17T10:39:50Z","title":"KCNet: An Insect-Inspired Single-Hidden-Layer Neural Network with\n Randomized Binary Weights for Prediction and Classification Tasks","summary":" Fruit flies are established model systems for studying olfactory learning as\nthey will readily learn to associate odors with both electric shock or sugar\nrewards. The mechanisms of the insect brain apparently responsible for odor\nlearning form a relatively shallow neuronal architecture. Olfactory inputs are\nreceived by the antennal lobe (AL) of the brain, which produces an encoding of\neach odor mixture across ~50 sub-units known as glomeruli. Each of these\nglomeruli then projects its component of this feature vector to several of\n~2000 so-called Kenyon Cells (KCs) in a region of the brain known as the\nmushroom body (MB). Fly responses to odors are generated by small downstream\nneutrophils that decode the higher-order representation from the MB. Research\nhas shown that there is no recognizable pattern in the glomeruli--KC\nconnections (and thus the particular higher-order representations); they are\nakin to fingerprints--even isogenic flies have different projections.\nLeveraging insights from this architecture, we propose KCNet, a\nsingle-hidden-layer neural network that contains sparse, randomized, binary\nweights between the input layer and the hidden layer and analytically learned\nweights between the hidden layer and the output layer. 
Furthermore, we also\npropose a dynamic optimization algorithm that enables the KCNet to increase\nperformance beyond its structural limits by searching for a more efficient set\nof inputs. For odorant-perception tasks that predict the perceptual properties\nof an odorant, we show that KCNet outperforms existing data-driven approaches,\nsuch as XGBoost. For image classification tasks, KCNet achieves reasonable\nperformance on benchmark datasets (MNIST, Fashion-MNIST, and EMNIST) without\nany data-augmentation methods or convolutional layers and shows a particularly\nfast running time.\n","authors":["Jinyung Hong","Theodore P. Pavlic"],"pdf_url":"https://arxiv.org/pdf/2108.07554v3.pdf","comment":"24 pages, 46 figures, 3 tables; The GitHub repo link was updated"},{"id":"http://arxiv.org/abs/2412.05781v3","updated":"2025-01-07T20:27:09Z","published":"2024-12-08T02:27:17Z","title":"Open-Source Acceleration of Stable-Diffusion.cpp Deployable on All\n Devices","summary":" Stable diffusion plays a crucial role in generating high-quality images.\nHowever, image generation is time-consuming and memory-intensive. To address\nthis, stable-diffusion.cpp (Sdcpp) emerges as an efficient inference framework\nto accelerate the diffusion models. Although it is lightweight, the current\nimplementation of ggml_conv_2d operator in Sdcpp is suboptimal, exhibiting both\nhigh inference latency and massive memory usage. To address this, in this work,\nwe present an optimized version of Sdcpp leveraging the Winograd algorithm to\naccelerate 2D convolution operations, which is the primary bottleneck in the\npipeline. By analyzing both dependent and independent computation graphs, we\nexploit the device's locality and parallelism to achieve substantial\nperformance improvements. Our framework delivers correct end-to-end results\nacross various stable diffusion models, including SDv1.4, v1.5, v2.1, SDXL, and\nSDXL-Turbo. 
Our evaluation results demonstrate a speedup up to 2.76x for\nindividual convolutional layers and an inference speedup up to 4.79x for the\noverall image generation process, compared with the original Sdcpp on M1 pro.\nHomepage: https://github.com/SealAILab/stable-diffusion-cpp\n","authors":["Jingxu Ng","Cheng Lv","Pu Zhao","Wei Niu","Juyi Lin","Minzhou Pan","Yun Liang","Yanzhi Wang"],"pdf_url":"https://arxiv.org/pdf/2412.05781v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04121v1","updated":"2025-01-07T20:02:55Z","published":"2025-01-07T20:02:55Z","title":"Graph-Based Multimodal and Multi-view Alignment for Keystep Recognition","summary":" Egocentric videos capture scenes from a wearer's viewpoint, resulting in\ndynamic backgrounds, frequent motion, and occlusions, posing challenges to\naccurate keystep recognition. We propose a flexible graph-learning framework\nfor fine-grained keystep recognition that is able to effectively leverage\nlong-term dependencies in egocentric videos, and leverage alignment between\negocentric and exocentric videos during training for improved inference on\negocentric videos. Our approach consists of constructing a graph where each\nvideo clip of the egocentric video corresponds to a node. During training, we\nconsider each clip of each exocentric video (if available) as additional nodes.\nWe examine several strategies to define connections across these nodes and pose\nkeystep recognition as a node classification task on the constructed graphs. We\nperform extensive experiments on the Ego-Exo4D dataset and show that our\nproposed flexible graph-based framework notably outperforms existing methods by\nmore than 12 points in accuracy. Furthermore, the constructed graphs are sparse\nand compute efficient. 
We also present a study on harnessing several\nmultimodal features, including narrations, depth, and object class labels, on a\nheterogeneous graph and discuss their corresponding contribution to the keystep\nrecognition performance.\n","authors":["Julia Lee Romero","Kyle Min","Subarna Tripathi","Morteza Karimzadeh"],"pdf_url":"https://arxiv.org/pdf/2501.04121v1.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2409.06267v2","updated":"2025-01-07T19:53:41Z","published":"2024-09-10T07:12:18Z","title":"Mahalanobis k-NN: A Statistical Lens for Robust Point-Cloud\n Registrations","summary":" In this paper, we discuss Mahalanobis k-NN: A Statistical Lens designed to\naddress the challenges of feature matching in learning-based point cloud\nregistration when confronted with an arbitrary density of point clouds. We\ntackle this by adopting Mahalanobis k-NN's inherent property to capture the\ndistribution of the local neighborhood and surficial geometry. Our method can\nbe seamlessly integrated into any local-graph-based point cloud analysis\nmethod. In this paper, we focus on two distinct methodologies: Deep Closest\nPoint (DCP) and Deep Universal Manifold Embedding (DeepUME). Our extensive\nbenchmarking on the ModelNet40 and FAUST datasets highlights the efficacy of\nthe proposed method in point cloud registration tasks. Moreover, we establish\nfor the first time that the features acquired through point cloud registration\ninherently can possess discriminative capabilities.
This is evident by a\nsubstantial improvement of about 20% in the average accuracy observed in the\npoint cloud few-shot classification task, benchmarked on ModelNet40 and\nScanObjectNN.\n","authors":["Tejas Anvekar","Shivanand Venkanna Sheshappanavar"],"pdf_url":"https://arxiv.org/pdf/2409.06267v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04074v1","updated":"2025-01-07T18:59:53Z","published":"2025-01-07T18:59:53Z","title":"NeRFs are Mirror Detectors: Using Structural Similarity for Multi-View\n Mirror Scene Reconstruction with 3D Surface Primitives","summary":" While neural radiance fields (NeRF) led to a breakthrough in photorealistic\nnovel view synthesis, handling mirroring surfaces still denotes a particular\nchallenge as they introduce severe inconsistencies in the scene representation.\nPrevious attempts either focus on reconstructing single reflective objects or\nrely on strong supervision guidance in terms of additional user-provided\nannotations of visible image regions of the mirrors, thereby limiting the\npractical usability. In contrast, in this paper, we present NeRF-MD, a method\nwhich shows that NeRFs can be considered as mirror detectors and which is\ncapable of reconstructing neural radiance fields of scenes containing mirroring\nsurfaces without the need for prior annotations. To this end, we first compute\nan initial estimate of the scene geometry by training a standard NeRF using a\ndepth reprojection loss. Our key insight lies in the fact that parts of the\nscene corresponding to a mirroring surface will still exhibit a significant\nphotometric inconsistency, whereas the remaining parts are already\nreconstructed in a plausible manner. This allows us to detect mirror surfaces\nby fitting geometric primitives to such inconsistent regions in this initial\nstage of the training. Using this information, we then jointly optimize the\nradiance field and mirror geometry in a second training stage to refine their\nquality. 
We demonstrate the capability of our method to allow the faithful\ndetection of mirrors in the scene as well as the reconstruction of a single\nconsistent scene representation, and demonstrate its potential in comparison to\nbaseline and mirror-aware approaches.\n","authors":["Leif Van Holland","Michael Weinmann","Jan U. Müller","Patrick Stotko","Reinhard Klein"],"pdf_url":"https://arxiv.org/pdf/2501.04074v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04073v1","updated":"2025-01-07T18:53:14Z","published":"2025-01-07T18:53:14Z","title":"Deep Learning for Ophthalmology: The State-of-the-Art and Future Trends","summary":" The emergence of artificial intelligence (AI), particularly deep learning\n(DL), has marked a new era in the realm of ophthalmology, offering\ntransformative potential for the diagnosis and treatment of posterior segment\neye diseases. This review explores the cutting-edge applications of DL across a\nrange of ocular conditions, including diabetic retinopathy, glaucoma,\nage-related macular degeneration, and retinal vessel segmentation. We provide a\ncomprehensive overview of foundational ML techniques and advanced DL\narchitectures, such as CNNs, attention mechanisms, and transformer-based\nmodels, highlighting the evolving role of AI in enhancing diagnostic accuracy,\noptimizing treatment strategies, and improving overall patient care.\nAdditionally, we present key challenges in integrating AI solutions into\nclinical practice, including ensuring data diversity, improving algorithm\ntransparency, and effectively leveraging multimodal data. This review\nemphasizes AI's potential to improve disease diagnosis and enhance patient care\nwhile stressing the importance of collaborative efforts to overcome these\nbarriers and fully harness AI's impact in advancing eye care.\n","authors":["Duy M. H. 
Nguyen","Hasan Md Tusfiqur Alam","Tai Nguyen","Devansh Srivastav","Hans-Juergen Profitlich","Ngan Le","Daniel Sonntag"],"pdf_url":"https://arxiv.org/pdf/2501.04073v1.pdf","comment":"First version"},{"id":"http://arxiv.org/abs/2501.04735v1","updated":"2025-01-07T19:57:15Z","published":"2025-01-07T19:57:15Z","title":"Topology-based deep-learning segmentation method for deep anterior\n lamellar keratoplasty (DALK) surgical guidance using M-mode OCT data","summary":" Deep Anterior Lamellar Keratoplasty (DALK) is a partial-thickness corneal\ntransplant procedure used to treat corneal stromal diseases. A crucial step in\nthis procedure is the precise separation of the deep stroma from Descemet's\nmembrane (DM) using the Big Bubble technique. To simplify the tasks of needle\ninsertion and pneumo-dissection in this technique, we previously developed an\nOptical Coherence Tomography (OCT)-guided, eye-mountable robot that uses\nreal-time tracking of corneal layers from M-mode OCT signals for control.\nHowever, signal noise and instability during manipulation of the OCT fiber\nsensor-integrated needle have hindered the performance of conventional\ndeep-learning segmentation methods, resulting in rough and inaccurate detection\nof corneal layers. To address these challenges, we have developed a\ntopology-based deep-learning segmentation method that integrates a topological\nloss function with a modified network architecture. This approach effectively\nreduces the effects of noise and improves segmentation speed, precision, and\nstability. Validation using in vivo, ex vivo, and hybrid rabbit eye datasets\ndemonstrates that our method outperforms traditional loss-based techniques,\nproviding fast, accurate, and robust segmentation of the epithelium and DM to\nguide surgery.\n","authors":["J. Yu","H. Yi","Y. Wang","J. D. Opfermann","W. G. Gensheimer","A. Krieger","J. U. 
Kang"],"pdf_url":"https://arxiv.org/pdf/2501.04735v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2501.03995v1","updated":"2025-01-07T18:52:05Z","published":"2025-01-07T18:52:05Z","title":"RAG-Check: Evaluating Multimodal Retrieval Augmented Generation\n Performance","summary":" Retrieval-augmented generation (RAG) improves large language models (LLMs) by\nusing external knowledge to guide response generation, reducing hallucinations.\nHowever, RAG, particularly multi-modal RAG, can introduce new hallucination\nsources: (i) the retrieval process may select irrelevant pieces (e.g.,\ndocuments, images) as raw context from the database, and (ii) retrieved images\nare processed into text-based context via vision-language models (VLMs) or\ndirectly used by multi-modal language models (MLLMs) like GPT-4o, which may\nhallucinate. To address this, we propose a novel framework to evaluate the\nreliability of multi-modal RAG using two performance measures: (i) the\nrelevancy score (RS), assessing the relevance of retrieved entries to the\nquery, and (ii) the correctness score (CS), evaluating the accuracy of the\ngenerated response. We train RS and CS models using a ChatGPT-derived database\nand human evaluator samples. Results show that both models achieve ~88%\naccuracy on test data. Additionally, we construct a 5000-sample human-annotated\ndatabase evaluating the relevancy of retrieved pieces and the correctness of\nresponse statements. Our RS model aligns with human preferences 20% more often\nthan CLIP in retrieval, and our CS model matches human preferences ~91% of the\ntime. Finally, we assess various RAG systems' selection and generation\nperformances using RS and CS.\n","authors":["Matin Mortaheb","Mohammad A. Amir Khojastepour","Srimat T. 
Chakradhar","Sennur Ulukus"],"pdf_url":"https://arxiv.org/pdf/2501.03995v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03989v1","updated":"2025-01-07T18:46:34Z","published":"2025-01-07T18:46:34Z","title":"(De)-Indexing and the Right to be Forgotten","summary":" In the digital age, the challenge of forgetfulness has emerged as a\nsignificant concern, particularly regarding the management of personal data and\nits accessibility online. The right to be forgotten (RTBF) allows individuals\nto request the removal of outdated or harmful information from public access,\nyet implementing this right poses substantial technical difficulties for search\nengines. This paper aims to introduce non-experts to the foundational concepts\nof information retrieval (IR) and de-indexing, which are critical for\nunderstanding how search engines can effectively \"forget\" certain content. We\nwill explore various IR models, including boolean, probabilistic, vector space,\nand embedding-based approaches, as well as the role of Large Language Models\n(LLMs) in enhancing data processing capabilities. By providing this overview,\nwe seek to highlight the complexities involved in balancing individual privacy\nrights with the operational challenges faced by search engines in managing\ninformation visibility.\n","authors":["Salvatore Vilella","Giancarlo Ruffo"],"pdf_url":"https://arxiv.org/pdf/2501.03989v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03930v1","updated":"2025-01-07T16:48:21Z","published":"2025-01-07T16:48:21Z","title":"Towards Reliable Testing for Multiple Information Retrieval System\n Comparisons","summary":" Null Hypothesis Significance Testing is the \\textit{de facto} tool for\nassessing effectiveness differences between Information Retrieval systems.\nResearchers use statistical tests to check whether those differences will\ngeneralise to online settings or are just due to the samples observed in the\nlaboratory. 
Much work has been devoted to studying which test is the most\nreliable when comparing a pair of systems, but most of the IR real-world\nexperiments involve more than two. In the multiple comparisons scenario,\ntesting several systems simultaneously may inflate the errors committed by the\ntests. In this paper, we use a new approach to assess the reliability of\nmultiple comparison procedures using simulated and real TREC data. Experiments\nshow that Wilcoxon plus the Benjamini-Hochberg correction yields Type I error\nrates according to the significance level for typical sample sizes while being\nthe best test in terms of statistical power.\n","authors":["David Otero","Javier Parapar","Álvaro Barreiro"],"pdf_url":"https://arxiv.org/pdf/2501.03930v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03904v1","updated":"2025-01-07T16:18:55Z","published":"2025-01-07T16:18:55Z","title":"Exploring the Potential of Large Language Models in Public\n Transportation: San Antonio Case Study","summary":" The integration of large language models (LLMs) into public transit systems\npresents a transformative opportunity to enhance urban mobility. This study\nexplores the potential of LLMs to revolutionize public transportation\nmanagement within the context of San Antonio's transit system. Leveraging the\ncapabilities of LLMs in natural language processing and data analysis, we\ninvestigate their capabilities to optimize route planning, reduce wait times,\nand provide personalized travel assistance. By utilizing the General Transit\nFeed Specification (GTFS) and other relevant data, this research aims to\ndemonstrate how LLMs can potentially improve resource allocation, elevate\npassenger satisfaction, and inform data-driven decision-making in transit\noperations. A comparative analysis of different ChatGPT models was conducted to\nassess their ability to understand transportation information, retrieve\nrelevant data, and provide comprehensive responses. 
Findings from this study\nsuggest that while LLMs hold immense promise for public transit, careful\nengineering and fine-tuning are essential to realizing their full potential.\nSan Antonio serves as a case study to inform the development of LLM-powered\ntransit systems in other urban environments.\n","authors":["Ramya Jonnala","Gongbo Liang","Jeong Yang","Izzat Alsmadi"],"pdf_url":"https://arxiv.org/pdf/2501.03904v1.pdf","comment":"This work is accepted to AAAI 2025 Workshop on AI for Urban Planning.\n arXiv admin note: substantial text overlap with arXiv:2407.11003"},{"id":"http://arxiv.org/abs/2501.03843v1","updated":"2025-01-07T14:53:35Z","published":"2025-01-07T14:53:35Z","title":"BERTopic for Topic Modeling of Hindi Short Texts: A Comparative Study","summary":" As short text data in native languages like Hindi increasingly appear in\nmodern media, robust methods for topic modeling on such data have gained\nimportance. This study investigates the performance of BERTopic in modeling\nHindi short texts, an area that has been under-explored in existing research.\nUsing contextual embeddings, BERTopic can capture semantic relationships in\ndata, making it potentially more effective than traditional models, especially\nfor short and diverse texts. 
We evaluate BERTopic using 6 different document\nembedding models and compare its performance against 8 established topic\nmodeling techniques, such as Latent Dirichlet Allocation (LDA), Non-negative\nMatrix Factorization (NMF), Latent Semantic Indexing (LSI), Additive\nRegularization of Topic Models (ARTM), Probabilistic Latent Semantic Analysis\n(PLSA), Embedded Topic Model (ETM), Combined Topic Model (CTM), and Top2Vec.\nThe models are assessed using coherence scores across a range of topic counts.\nOur results reveal that BERTopic consistently outperforms other models in\ncapturing coherent topics from short Hindi texts.\n","authors":["Atharva Mutsaddi","Anvi Jamkhande","Aryan Thakre","Yashodhara Haribhakta"],"pdf_url":"https://arxiv.org/pdf/2501.03843v1.pdf","comment":"Accepted into IndoNLP: The First Workshop on Natural Language\n Processing for Indo-Aryan and Dravidian Languages, collocated with COLING\n 2025. Set to appear in the workshop proceedings published in ACL Anthology"},{"id":"http://arxiv.org/abs/2501.03835v1","updated":"2025-01-07T14:45:30Z","published":"2025-01-07T14:45:30Z","title":"TACLR: A Scalable and Efficient Retrieval-based Method for Industrial\n Product Attribute Value Identification","summary":" Product Attribute Value Identification (PAVI) involves identifying attribute\nvalues from product profiles, a key task for improving product search,\nrecommendations, and business analytics on e-commerce platforms. However,\nexisting PAVI methods face critical challenges, such as inferring implicit\nvalues, handling out-of-distribution (OOD) values, and producing normalized\noutputs. To address these limitations, we introduce Taxonomy-Aware Contrastive\nLearning Retrieval (TACLR), the first retrieval-based method for PAVI. TACLR\nformulates PAVI as an information retrieval task by encoding product profiles\nand candidate values into embeddings and retrieving values based on their\nsimilarity to the item embedding. 
It leverages contrastive training with\ntaxonomy-aware hard negative sampling and employs adaptive inference with\ndynamic thresholds. TACLR offers three key advantages: (1) it effectively\nhandles implicit and OOD values while producing normalized outputs; (2) it\nscales to thousands of categories, tens of thousands of attributes, and\nmillions of values; and (3) it supports efficient inference for high-load\nindustrial scenarios. Extensive experiments on proprietary and public datasets\nvalidate the effectiveness and efficiency of TACLR. Moreover, it has been\nsuccessfully deployed in a real-world e-commerce platform, processing millions\nof product listings daily while supporting dynamic, large-scale attribute\ntaxonomies.\n","authors":["Yindu Su","Huike Zou","Lin Sun","Ting Zhang","Haiyang Yang","Liyu Chen","David Lo","Qingheng Zhang","Shuguang Han","Jufeng Chen"],"pdf_url":"https://arxiv.org/pdf/2501.03835v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03811v1","updated":"2025-01-07T14:24:49Z","published":"2025-01-07T14:24:49Z","title":"Extending ChatGPT with a Browserless System for Web Product Price\n Extraction","summary":" With the advent of ChatGPT, we can find very clean, precise answers to a\nwide variety of questions. However, for questions such as 'find the price of\nthe lemon cake at zingerman's', the answer looks like 'I can't browse the web\nright now'. In this paper, we propose a system, called Wextractor, which\nextends ChatGPT to answer questions such as the one mentioned before. Obviously, our\nsystem cannot be labeled as 'artificial intelligence'. It simply offers to\ncover a kind of transactional search that is not included in the current\nversion of ChatGPT.
Moreover, Wextractor includes two improvements with respect\nto the initial version: social extraction and pointing pattern extraction to\nimprove the answer speed.\n","authors":["Jorge Lloret-Gazo"],"pdf_url":"https://arxiv.org/pdf/2501.03811v1.pdf","comment":"14 pages, 4 figures"},{"id":"http://arxiv.org/abs/2501.03769v1","updated":"2025-01-07T13:22:35Z","published":"2025-01-07T13:22:35Z","title":"Multi-label Cross-lingual automatic music genre classification from\n lyrics with Sentence BERT","summary":" Music genres are shaped by both the stylistic features of songs and the\ncultural preferences of artists' audiences. Automatic classification of music\ngenres using lyrics can be useful in several applications such as\nrecommendation systems, playlist creation, and library organization. We present\na multi-label, cross-lingual genre classification system based on multilingual\nsentence embeddings generated by sBERT. Using a bilingual Portuguese-English\ndataset with eight overlapping genres, we demonstrate the system's ability to\ntrain on lyrics in one language and predict genres in another. Our approach\noutperforms the baseline approach of translating lyrics and using a\nbag-of-words representation, improving the genrewise average F1-Score from 0.35\nto 0.69. The classifier uses a one-vs-all architecture, enabling it to assign\nmultiple genre labels to a single lyric. Experimental results reveal that\ndataset centralization notably improves cross-lingual performance. 
This\napproach offers a scalable solution for genre classification across\nunderrepresented languages and cultural domains, advancing the capabilities of\nmusic information retrieval systems.\n","authors":["Tiago Fernandes Tavares","Fabio José Ayres"],"pdf_url":"https://arxiv.org/pdf/2501.03769v1.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2409.06096v4","updated":"2025-01-07T10:45:58Z","published":"2024-09-09T22:16:48Z","title":"Latent Diffusion Bridges for Unsupervised Musical Audio Timbre Transfer","summary":" Music timbre transfer is a challenging task that involves modifying the\ntimbral characteristics of an audio signal while preserving its melodic\nstructure. In this paper, we propose a novel method based on dual diffusion\nbridges, trained using the CocoChorales Dataset, which consists of unpaired\nmonophonic single-instrument audio data. Each diffusion model is trained on a\nspecific instrument with a Gaussian prior. During inference, a model is\ndesignated as the source model to map the input audio to its corresponding\nGaussian prior, and another model is designated as the target model to\nreconstruct the target audio from this Gaussian prior, thereby facilitating\ntimbre transfer. We compare our approach against existing unsupervised timbre\ntransfer models such as VAEGAN and Gaussian Flow Bridges (GFB). Experimental\nresults demonstrate that our method achieves both better Fr\\'echet Audio\nDistance (FAD) and melody preservation, as reflected by lower pitch distances\n(DPD) compared to VAEGAN and GFB. Additionally, we discover that the noise\nlevel from the Gaussian prior, $\\sigma$, can be adjusted to control the degree\nof melody preservation and amount of timbre transferred.\n","authors":["Michele Mancusi","Yurii Halychanskyi","Kin Wai Cheuk","Eloi Moliner","Chieh-Hsin Lai","Stefan Uhlich","Junghyun Koo","Marco A. 
Martínez-Ramírez","Wei-Hsiang Liao","Giorgio Fabbro","Yuki Mitsufuji"],"pdf_url":"https://arxiv.org/pdf/2409.06096v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03598v1","updated":"2025-01-07T07:55:35Z","published":"2025-01-07T07:55:35Z","title":"RecKG: Knowledge Graph for Recommender Systems","summary":" Knowledge graphs have proven successful in integrating heterogeneous data\nacross various domains. However, there remains a noticeable dearth of research\non their seamless integration among heterogeneous recommender systems, despite\nknowledge graph-based recommender systems garnering extensive research\nattention. This study aims to fill this gap by proposing RecKG, a standardized\nknowledge graph for recommender systems. RecKG ensures the consistent\nrepresentation of entities across different datasets, accommodating diverse\nattribute types for effective data integration. Through a meticulous\nexamination of various recommender system datasets, we select attributes for\nRecKG, ensuring standardized formatting through consistent naming conventions.\nBy these characteristics, RecKG can seamlessly integrate heterogeneous data\nsources, enabling the discovery of additional semantic information within the\nintegrated knowledge graph. 
We apply RecKG to standardize real-world datasets,\nsubsequently developing an application for RecKG using a graph database.\nFinally, we validate RecKG's achievement in interoperability through a\nqualitative evaluation between RecKG and other studies.\n","authors":["Junhyuk Kwon","Seokho Ahn","Young-Duk Seo"],"pdf_url":"https://arxiv.org/pdf/2501.03598v1.pdf","comment":"Accepted by The 39th ACM/SIGAPP Symposium On Applied Computing(SAC)\n 2024"},{"id":"http://arxiv.org/abs/2412.02155v2","updated":"2025-01-07T06:30:24Z","published":"2024-12-03T04:29:27Z","title":"CausalMob: Causal Human Mobility Prediction with LLMs-derived Human\n Intentions toward Public Events","summary":" Large-scale human mobility exhibits spatial and temporal patterns that can\nassist policymakers in decision making. Although traditional prediction models\nattempt to capture these patterns, they are often disrupted by non-periodic public\nevents, such as disasters and occasional celebrations. Since regular human\nmobility patterns are heavily affected by these events, estimating their causal\neffects is critical to accurate mobility predictions. Although news articles\nprovide unique perspectives on these events in an unstructured format,\nprocessing them is a challenge. In this study, we propose a causality-augmented\nprediction model, called CausalMob, to analyze the causal effects of public\nevents. We first utilize large language models (LLMs) to extract human\nintentions from news articles and transform them into features that act as\ncausal treatments. Next, the model learns representations of spatio-temporal\nregional covariates from multiple data sources to serve as confounders for\ncausal inference. 
Finally, we present a causal effect estimation framework to\nensure event features remain independent of confounders during prediction.\nBased on large-scale real-world data, the experimental results show that the\nproposed model excels in human mobility prediction, outperforming\nstate-of-the-art models.\n","authors":["Xiaojie Yang","Hangli Ge","Jiawei Wang","Zipei Fan","Renhe Jiang","Ryosuke Shibasaki","Noboru Koshizuka"],"pdf_url":"https://arxiv.org/pdf/2412.02155v2.pdf","comment":"Accepted by KDD 2025"},{"id":"http://arxiv.org/abs/2501.03228v2","updated":"2025-01-07T04:05:53Z","published":"2025-01-06T18:59:55Z","title":"LightGNN: Simple Graph Neural Network for Recommendation","summary":" Graph neural networks (GNNs) have demonstrated superior performance in\ncollaborative recommendation through their ability to conduct high-order\nrepresentation smoothing, effectively capturing structural information within\nusers' interaction patterns. However, existing GNN paradigms face significant\nchallenges in scalability and robustness when handling large-scale, noisy, and\nreal-world datasets. To address these challenges, we present LightGNN, a\nlightweight and distillation-based GNN pruning framework designed to\nsubstantially reduce model complexity while preserving essential collaboration\nmodeling capabilities. Our LightGNN framework introduces a computationally\nefficient pruning module that adaptively identifies and removes redundant edges\nand embedding entries for model compression. The framework is guided by a\nresource-friendly hierarchical knowledge distillation objective, whose\nintermediate layer augments the observed graph to maintain performance,\nparticularly in high-rate compression scenarios. Extensive experiments on\npublic datasets demonstrate LightGNN's effectiveness, significantly improving\nboth computational efficiency and recommendation accuracy. 
Notably, LightGNN\nachieves an 80% reduction in edge count and 90% reduction in embedding entries\nwhile maintaining performance comparable to more complex state-of-the-art\nbaselines. The implementation of our LightGNN framework is available at the\ngithub repository: https://github.com/HKUDS/LightGNN.\n","authors":["Guoxuan Chen","Lianghao Xia","Chao Huang"],"pdf_url":"https://arxiv.org/pdf/2501.03228v2.pdf","comment":"Accepted to WSDM 2025 Oral"},{"id":"http://arxiv.org/abs/2501.04167v1","updated":"2025-01-07T22:29:08Z","published":"2025-01-07T22:29:08Z","title":"Reasoning-Enhanced Self-Training for Long-Form Personalized Text\n Generation","summary":" Personalized text generation requires a unique ability of large language\nmodels (LLMs) to learn from context that they often do not encounter during\ntheir standard training. One way to encourage LLMs to better use personalized\ncontext for generating outputs that better align with the user's expectations\nis to instruct them to reason over the user's past preferences, background\nknowledge, or writing style. To achieve this, we propose Reasoning-Enhanced\nSelf-Training for Personalized Text Generation (REST-PG), a framework that\ntrains LLMs to reason over personal data during response generation. REST-PG\nfirst generates reasoning paths to train the LLM's reasoning abilities and then\nemploys Expectation-Maximization Reinforced Self-Training to iteratively train\nthe LLM based on its own high-reward outputs. We evaluate REST-PG on the\nLongLaMP benchmark, consisting of four diverse personalized long-form text\ngeneration tasks. 
Our experiments demonstrate that REST-PG achieves significant\nimprovements over state-of-the-art baselines, with an average relative\nperformance gain of 14.5% on the benchmark.\n","authors":["Alireza Salemi","Cheng Li","Mingyang Zhang","Qiaozhu Mei","Weize Kong","Tao Chen","Zhuowan Li","Michael Bendersky","Hamed Zamani"],"pdf_url":"https://arxiv.org/pdf/2501.04167v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04161v1","updated":"2025-01-07T22:19:15Z","published":"2025-01-07T22:19:15Z","title":"KGIF: Optimizing Relation-Aware Recommendations with Knowledge Graph\n Information Fusion","summary":" While deep-learning-enabled recommender systems demonstrate strong\nperformance benchmarks, many struggle to adapt effectively in real-world\nenvironments due to limited use of user-item relationship data and insufficient\ntransparency in recommendation generation. Traditional collaborative filtering\napproaches fail to integrate multifaceted item attributes, and although\nFactorization Machines account for item-specific details, they overlook broader\nrelational patterns. Collaborative knowledge graph-based models have progressed\nby embedding user-item interactions with item-attribute relationships, offering\na holistic perspective on interconnected entities. However, these models\nfrequently aggregate attribute and interaction data in an implicit manner,\nleaving valuable relational nuances underutilized.\n This study introduces the Knowledge Graph Attention Network with Information\nFusion (KGIF), a specialized framework designed to merge entity and relation\nembeddings explicitly through a tailored self-attention mechanism. The KGIF\nframework integrates reparameterization via dynamic projection vectors,\nenabling embeddings to adaptively represent intricate relationships within\nknowledge graphs. 
This explicit fusion enhances the interplay between user-item\ninteractions and item-attribute relationships, providing a nuanced balance\nbetween user-centric and item-centric representations. An attentive propagation\nmechanism further optimizes knowledge graph embeddings, capturing multi-layered\ninteraction patterns. The contributions of this work include an innovative\nmethod for explicit information fusion, improved robustness for sparse\nknowledge graphs, and the ability to generate explainable recommendations\nthrough interpretable path visualization.\n","authors":["Dong Hyun Jeon","Wenbo Sun","Houbing Herbert Song","Dongfang Liu","Velasquez Alvaro","Yixin Chloe Xie","Shuteng Niu"],"pdf_url":"https://arxiv.org/pdf/2501.04161v1.pdf","comment":"Published at IEEE Big Data 2024"},{"id":"http://arxiv.org/abs/2412.16181v2","updated":"2025-01-07T22:12:47Z","published":"2024-12-10T16:51:11Z","title":"Minimum Weighted Feedback Arc Sets for Ranking from Pairwise Comparisons","summary":" The Minimum Weighted Feedback Arc Set (MWFAS) problem is fundamentally\nconnected to the Ranking Problem -- the task of deriving global rankings from\npairwise comparisons. Recent work [He et al. ICML2022] has advanced the\nstate-of-the-art for the Ranking Problem using learning-based methods,\nimproving upon multiple previous approaches. However, the connection to MWFAS\nremains underexplored. This paper investigates this relationship and presents\nefficient combinatorial algorithms for solving MWFAS, thus addressing the\nRanking Problem. 
Our experimental results demonstrate that these simple,\nlearning-free algorithms not only significantly outperform learning-based\nmethods in terms of speed but also generally achieve superior ranking accuracy.\n","authors":["Soroush Vahidi","Ioannis Koutis"],"pdf_url":"https://arxiv.org/pdf/2412.16181v2.pdf","comment":"This is a preliminary paper"},{"id":"http://arxiv.org/abs/2501.05475v1","updated":"2025-01-07T08:57:42Z","published":"2025-01-07T08:57:42Z","title":"Retrieval-Augmented Generation by Evidence Retroactivity in LLMs","summary":" Retrieval-augmented generation has gained significant attention due to its\nability to integrate relevant external knowledge, enhancing the accuracy and\nreliability of the LLMs' responses. Most of the existing methods apply a\ndynamic multiple retrieval-generating process to address multi-hop complex\nquestions by decomposing them into sub-problems. However, these methods rely on\na unidirectional forward reasoning paradigm, where errors from insufficient\nreasoning steps or inherent flaws in current retrieval systems are\nirreversible, potentially derailing the entire reasoning chain. For the first\ntime, this work introduces Retroactive Retrieval-Augmented Generation\n(RetroRAG), a novel framework to build a retroactive reasoning paradigm.\nRetroRAG revises and updates the evidence, redirecting the reasoning chain in\nthe correct direction. RetroRAG constructs an evidence-collation-discovery\nframework to search, generate, and refine credible evidence. It synthesizes\ninferential evidence related to the key entities in the question from the\nexisting source knowledge and formulates search queries to uncover additional\ninformation. As new evidence is found, RetroRAG continually updates and\norganizes this information, enhancing its ability to locate further necessary\nevidence. 
Paired with an Answerer to generate and evaluate outputs, RetroRAG is\ncapable of refining its reasoning process iteratively until a reliable answer\nis obtained. Empirical evaluations show that RetroRAG significantly outperforms\nexisting methods.\n","authors":["Liang Xiao","Wen Dai","Shuai Chen","Bin Qin","Chongyang Shi","Haopeng Jing","Tianyu Guo"],"pdf_url":"https://arxiv.org/pdf/2501.05475v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2501.04005v1","updated":"2025-01-07T18:59:59Z","published":"2025-01-07T18:59:59Z","title":"LargeAD: Large-Scale Cross-Sensor Data Pretraining for Autonomous\n Driving","summary":" Recent advancements in vision foundation models (VFMs) have revolutionized\nvisual perception in 2D, yet their potential for 3D scene understanding,\nparticularly in autonomous driving applications, remains underexplored. In this\npaper, we introduce LargeAD, a versatile and scalable framework designed for\nlarge-scale 3D pretraining across diverse real-world driving datasets. Our\nframework leverages VFMs to extract semantically rich superpixels from 2D\nimages, which are aligned with LiDAR point clouds to generate high-quality\ncontrastive samples. This alignment facilitates cross-modal representation\nlearning, enhancing the semantic consistency between 2D and 3D data. We\nintroduce several key innovations: i) VFM-driven superpixel generation for\ndetailed semantic representation, ii) a VFM-assisted contrastive learning\nstrategy to align multimodal features, iii) superpoint temporal consistency to\nmaintain stable representations across time, and iv) multi-source data\npretraining to generalize across various LiDAR configurations. Our approach\ndelivers significant performance improvements over state-of-the-art methods in\nboth linear probing and fine-tuning tasks for both LiDAR-based segmentation and\nobject detection. 
Extensive experiments on eleven large-scale multi-modal\ndatasets highlight our superior performance, demonstrating the adaptability,\nefficiency, and robustness in real-world autonomous driving scenarios.\n","authors":["Lingdong Kong","Xiang Xu","Youquan Liu","Jun Cen","Runnan Chen","Wenwei Zhang","Liang Pan","Kai Chen","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2501.04005v1.pdf","comment":"Preprint; 16 pages, 7 figures, 8 tables; Project Page at\n https://ldkong.com/LargeAD"},{"id":"http://arxiv.org/abs/2501.04004v1","updated":"2025-01-07T18:59:58Z","published":"2025-01-07T18:59:58Z","title":"LiMoE: Mixture of LiDAR Representation Learners from Automotive Scenes","summary":" LiDAR data pretraining offers a promising approach to leveraging large-scale,\nreadily available datasets for enhanced data utilization. However, existing\nmethods predominantly focus on sparse voxel representation, overlooking the\ncomplementary attributes provided by other LiDAR representations. In this work,\nwe propose LiMoE, a framework that integrates the Mixture of Experts (MoE)\nparadigm into LiDAR data representation learning to synergistically combine\nmultiple representations, such as range images, sparse voxels, and raw points.\nOur approach consists of three stages: i) Image-to-LiDAR Pretraining, which\ntransfers prior knowledge from images to point clouds across different\nrepresentations; ii) Contrastive Mixture Learning (CML), which uses MoE to\nadaptively activate relevant attributes from each representation and distills\nthese mixed features into a unified 3D network; iii) Semantic Mixture\nSupervision (SMS), which combines semantic logits from multiple representations\nto boost downstream segmentation performance. Extensive experiments across 11\nlarge-scale LiDAR datasets demonstrate our effectiveness and superiority. 
The\ncode and model checkpoints have been made publicly accessible.\n","authors":["Xiang Xu","Lingdong Kong","Hui Shuai","Liang Pan","Ziwei Liu","Qingshan Liu"],"pdf_url":"https://arxiv.org/pdf/2501.04004v1.pdf","comment":"Preprint; 26 pages, 17 figures, 7 tables; Project Page at\n https://ldkong.com/LiMoE"},{"id":"http://arxiv.org/abs/2412.05313v3","updated":"2025-01-07T18:57:23Z","published":"2024-11-28T19:31:50Z","title":"λ: A Benchmark for Data-Efficiency in Long-Horizon Indoor Mobile\n Manipulation Robotics","summary":" Efficiently learning and executing long-horizon mobile manipulation (MoMa)\ntasks is crucial for advancing robotics in household and workplace settings.\nHowever, current MoMa models are data-inefficient, underscoring the need for\nimproved models that require realistic-sized benchmarks to evaluate their\nefficiency, which do not exist. To address this, we introduce the LAMBDA\n({\\lambda}) benchmark (Long-horizon Actions for Mobile-manipulation\nBenchmarking of Directed Activities), which evaluates the data efficiency of\nmodels on language-conditioned, long-horizon, multi-room, multi-floor,\npick-and-place tasks using a dataset of manageable size, more feasible for\ncollection. The benchmark includes 571 human-collected demonstrations that\nprovide realism and diversity in simulated and real-world settings. Unlike\nplanner-generated data, these trajectories offer natural variability and\nreplay-verifiability, ensuring robust learning and evaluation. We benchmark\nseveral models, including learning-based models and a neuro-symbolic modular\napproach combining foundation models with task and motion planning.\nLearning-based models show suboptimal success rates, even when leveraging\npretrained weights, underscoring significant data inefficiencies. However, the\nneuro-symbolic approach performs significantly better while being more data\nefficient. Findings highlight the need for more data-efficient learning-based\nMoMa approaches. 
{\\lambda} addresses this gap by serving as a key benchmark for\nevaluating the data efficiency of those future models in handling household\nrobotics tasks.\n","authors":["Ahmed Jaafar","Shreyas Sundara Raman","Yichen Wei","Sudarshan Harithas","Sofia Juliani","Anneke Wernerfelt","Benedict Quartey","Ifrah Idrees","Jason Xinyu Liu","Stefanie Tellex"],"pdf_url":"https://arxiv.org/pdf/2412.05313v3.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2501.04000v1","updated":"2025-01-07T18:56:14Z","published":"2025-01-07T18:56:14Z","title":"A Survey on Federated Learning in Human Sensing","summary":" Human Sensing, a field that leverages technology to monitor human activities,\npsycho-physiological states, and interactions with the environment, enhances\nour understanding of human behavior and drives the development of advanced\nservices that improve overall quality of life. However, its reliance on\ndetailed and often privacy-sensitive data as the basis for its machine learning\n(ML) models raises significant legal and ethical concerns. The recently\nproposed ML approach of Federated Learning (FL) promises to alleviate many of\nthese concerns, as it is able to create accurate ML models without sending raw\nuser data to a central server. While FL has demonstrated its usefulness across\na variety of areas, such as text prediction and cyber security, its benefits in\nHuman Sensing are under-explored, given the particular challenges in this\ndomain. This survey conducts a comprehensive analysis of the current\nstate-of-the-art studies on FL in Human Sensing, and proposes a taxonomy and an\neight-dimensional assessment for FL approaches. Through the eight-dimensional\nassessment, we then evaluate whether the surveyed studies consider a specific\nFL-in-Human-Sensing challenge or not. Finally, based on the overall analysis,\nwe discuss open challenges and highlight five research aspects related to FL in\nHuman Sensing that require urgent research attention. 
Our work provides a\ncomprehensive corpus of FL studies and aims to assist FL practitioners in\ndeveloping and evaluating solutions that effectively address the real-world\ncomplexities of Human Sensing.\n","authors":["Mohan Li","Martin Gjoreski","Pietro Barbiero","Gašper Slapničar","Mitja Luštrek","Nicholas D. Lane","Marc Langheinrich"],"pdf_url":"https://arxiv.org/pdf/2501.04000v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03999v1","updated":"2025-01-07T18:55:02Z","published":"2025-01-07T18:55:02Z","title":"WAPTS: A Weighted Allocation Probability Adjusted Thompson Sampling\n Algorithm for High-Dimensional and Sparse Experiment Settings","summary":" Aiming for more effective experiment design, such as in video content\nadvertising where different content options compete for user engagement, these\nscenarios can be modeled as multi-arm bandit problems. In cases where limited\ninteractions are available due to external factors, such as the cost of\nconducting experiments, recommenders often face constraints due to the small\nnumber of user interactions. In addition, there is a trade-off between\nselecting the best treatment and the ability to personalize and contextualize\nbased on individual factors. A popular solution to this dilemma is the\nContextual Bandit framework. It aims to maximize outcomes while incorporating\npersonalization (contextual) factors, customizing treatments such as a user's\nprofile to individual preferences. Despite their advantages, Contextual Bandit\nalgorithms face challenges like measurement bias and the 'curse of\ndimensionality.' These issues complicate the management of numerous\ninterventions and often lead to data sparsity through participant segmentation.\nTo address these problems, we introduce the Weighted Allocation Probability\nAdjusted Thompson Sampling (WAPTS) algorithm. WAPTS builds on the contextual\nThompson Sampling method by using a dynamic weighting parameter. 
This improves\nthe allocation process for interventions and enables rapid optimization in\ndata-sparse environments. We demonstrate the performance of our approach on\ndifferent numbers of arms and effect sizes.\n","authors":["Haochen Song","Ilya Musabirov","Ananya Bhattacharjee","Audrey Durand","Meredith Franklin","Anna Rafferty","Joseph Jay Williams"],"pdf_url":"https://arxiv.org/pdf/2501.03999v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03995v1","updated":"2025-01-07T18:52:05Z","published":"2025-01-07T18:52:05Z","title":"RAG-Check: Evaluating Multimodal Retrieval Augmented Generation\n Performance","summary":" Retrieval-augmented generation (RAG) improves large language models (LLMs) by\nusing external knowledge to guide response generation, reducing hallucinations.\nHowever, RAG, particularly multi-modal RAG, can introduce new hallucination\nsources: (i) the retrieval process may select irrelevant pieces (e.g.,\ndocuments, images) as raw context from the database, and (ii) retrieved images\nare processed into text-based context via vision-language models (VLMs) or\ndirectly used by multi-modal language models (MLLMs) like GPT-4o, which may\nhallucinate. To address this, we propose a novel framework to evaluate the\nreliability of multi-modal RAG using two performance measures: (i) the\nrelevancy score (RS), assessing the relevance of retrieved entries to the\nquery, and (ii) the correctness score (CS), evaluating the accuracy of the\ngenerated response. We train RS and CS models using a ChatGPT-derived database\nand human evaluator samples. Results show that both models achieve ~88%\naccuracy on test data. Additionally, we construct a 5000-sample human-annotated\ndatabase evaluating the relevancy of retrieved pieces and the correctness of\nresponse statements. Our RS model aligns with human preferences 20% more often\nthan CLIP in retrieval, and our CS model matches human preferences ~91% of the\ntime. 
Finally, we assess various RAG systems' selection and generation\nperformances using RS and CS.\n","authors":["Matin Mortaheb","Mohammad A. Amir Khojastepour","Srimat T. Chakradhar","Sennur Ulukus"],"pdf_url":"https://arxiv.org/pdf/2501.03995v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14794v5","updated":"2025-01-07T18:49:42Z","published":"2024-06-20T23:51:32Z","title":"ImageFlowNet: Forecasting Multiscale Image-Level Trajectories of Disease\n Progression with Irregularly-Sampled Longitudinal Medical Images","summary":" Advances in medical imaging technologies have enabled the collection of\nlongitudinal images, which involve repeated scanning of the same patients over\ntime, to monitor disease progression. However, predictive modeling of such data\nremains challenging due to high dimensionality, irregular sampling, and data\nsparsity. To address these issues, we propose ImageFlowNet, a novel model\ndesigned to forecast disease trajectories from initial images while preserving\nspatial details. ImageFlowNet first learns multiscale joint representation\nspaces across patients and time points, then optimizes deterministic or\nstochastic flow fields within these spaces using a position-parameterized\nneural ODE/SDE framework. The model leverages a UNet architecture to create\nrobust multiscale representations and mitigates data scarcity by combining\nknowledge from all patients. We provide theoretical insights that support our\nformulation of ODEs, and motivate our regularizations involving high-level\nvisual features, latent space organization, and trajectory smoothness. We\nvalidate ImageFlowNet on three longitudinal medical image datasets depicting\nprogression in geographic atrophy, multiple sclerosis, and glioblastoma,\ndemonstrating its ability to effectively forecast disease progression and\noutperform existing methods. 
Our contributions include the development of\nImageFlowNet, its theoretical underpinnings, and empirical validation on\nreal-world datasets. The official implementation is available at\nhttps://github.com/KrishnaswamyLab/ImageFlowNet.\n","authors":["Chen Liu","Ke Xu","Liangbo L. Shen","Guillaume Huguet","Zilong Wang","Alexander Tong","Danilo Bzdok","Jay Stewart","Jay C. Wang","Lucian V. Del Priore","Smita Krishnaswamy"],"pdf_url":"https://arxiv.org/pdf/2406.14794v5.pdf","comment":"Accepted to ICASSP 2025"},{"id":"http://arxiv.org/abs/2501.03190v2","updated":"2025-01-07T18:34:22Z","published":"2025-01-06T18:05:35Z","title":"Multimodal Machine Learning Can Predict Videoconference Fluidity and\n Enjoyment","summary":" Videoconferencing is now a frequent mode of communication in both\nprofessional and informal settings, yet it often lacks the fluidity and\nenjoyment of in-person conversation. This study leverages multimodal machine\nlearning to predict moments of negative experience in videoconferencing. We\nsampled thousands of short clips from the RoomReader corpus, extracting audio\nembeddings, facial actions, and body motion features to train models for\nidentifying low conversational fluidity, low enjoyment, and classifying\nconversational events (backchanneling, interruption, or gap). Our best models\nachieved an ROC-AUC of up to 0.87 on hold-out videoconference sessions, with\ndomain-general audio features proving most critical. This work demonstrates\nthat multimodal audio-video signals can effectively predict high-level\nsubjective conversational outcomes. 
In addition, this is a contribution to\nresearch on videoconferencing user experience by showing that multimodal\nmachine learning can be used to identify rare moments of negative user\nexperience for further study or mitigation.\n","authors":["Andrew Chang","Viswadruth Akkaraju","Ray McFadden Cogliano","David Poeppel","Dustin Freeman"],"pdf_url":"https://arxiv.org/pdf/2501.03190v2.pdf","comment":"ICASSP 2025"},{"id":"http://arxiv.org/abs/2409.08861v5","updated":"2025-01-07T18:12:27Z","published":"2024-09-13T14:22:14Z","title":"Adjoint Matching: Fine-tuning Flow and Diffusion Generative Models with\n Memoryless Stochastic Optimal Control","summary":" Dynamical generative models that produce samples through an iterative\nprocess, such as Flow Matching and denoising diffusion models, have seen\nwidespread use, but there have not been many theoretically-sound methods for\nimproving these models with reward fine-tuning. In this work, we cast reward\nfine-tuning as stochastic optimal control (SOC). Critically, we prove that a\nvery specific memoryless noise schedule must be enforced during fine-tuning, in\norder to account for the dependency between the noise variable and the\ngenerated samples. We also propose a new algorithm named Adjoint Matching which\noutperforms existing SOC algorithms, by casting SOC problems as a regression\nproblem. We find that our approach significantly improves over existing methods\nfor reward fine-tuning, achieving better consistency, realism, and\ngeneralization to unseen human preference reward models, while retaining sample\ndiversity.\n","authors":["Carles Domingo-Enrich","Michal Drozdzal","Brian Karrer","Ricky T. Q. 
Chen"],"pdf_url":"https://arxiv.org/pdf/2409.08861v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05300v5","updated":"2025-01-07T17:42:16Z","published":"2024-03-08T13:29:46Z","title":"Unity by Diversity: Improved Representation Learning in Multimodal VAEs","summary":" Variational Autoencoders for multimodal data hold promise for many tasks in\ndata analysis, such as representation learning, conditional generation, and\nimputation. Current architectures either share the encoder output, decoder\ninput, or both across modalities to learn a shared representation. Such\narchitectures impose hard constraints on the model. In this work, we show that\na better latent representation can be obtained by replacing these hard\nconstraints with a soft constraint. We propose a new mixture-of-experts prior,\nsoftly guiding each modality's latent representation towards a shared aggregate\nposterior. This approach results in a superior latent representation and allows\neach encoding to preserve information better from its uncompressed original\nfeatures. In extensive experiments on multiple benchmark datasets and two\nchallenging real-world datasets, we show improved learned latent\nrepresentations and imputation of missing data modalities compared to existing\nmethods.\n","authors":["Thomas M. Sutter","Yang Meng","Andrea Agostini","Daphné Chopard","Norbert Fortin","Julia E. 
Vogt","Babak Shahbaba","Stephan Mandt"],"pdf_url":"https://arxiv.org/pdf/2403.05300v5.pdf","comment":"Accepted at Neurips 2024"},{"id":"http://arxiv.org/abs/2411.00568v2","updated":"2025-01-07T17:36:14Z","published":"2024-11-01T13:26:13Z","title":"Constrained Sampling with Primal-Dual Langevin Monte Carlo","summary":" This work considers the problem of sampling from a probability distribution\nknown up to a normalization constant while satisfying a set of statistical\nconstraints specified by the expected values of general nonlinear functions.\nThis problem finds applications in, e.g., Bayesian inference, where it can\nconstrain moments to evaluate counterfactual scenarios or enforce desiderata\nsuch as prediction fairness. Methods developed to handle support constraints,\nsuch as those based on mirror maps, barriers, and penalties, are not suited for\nthis task. This work therefore relies on gradient descent-ascent dynamics in\nWasserstein space to put forward a discrete-time primal-dual Langevin Monte\nCarlo algorithm (PD-LMC) that simultaneously constrains the target distribution\nand samples from it. We analyze the convergence of PD-LMC under standard\nassumptions on the target distribution and constraints, namely (strong)\nconvexity and log-Sobolev inequalities. To do so, we bring classical\noptimization arguments for saddle-point algorithms to the geometry of\nWasserstein space. We illustrate the relevance and effectiveness of PD-LMC in\nseveral applications.\n","authors":["Luiz F. O. Chamon","Mohammad Reza Karimi","Anna Korba"],"pdf_url":"https://arxiv.org/pdf/2411.00568v2.pdf","comment":"39 pages, 14 figures. 
Published at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.11814v4","updated":"2025-01-07T17:35:00Z","published":"2024-06-17T17:54:42Z","title":"Stochastic Neural Network Symmetrisation in Markov Categories","summary":" We consider the problem of symmetrising a neural network along a group\nhomomorphism: given a homomorphism $\\varphi : H \\to G$, we would like a\nprocedure that converts $H$-equivariant neural networks to $G$-equivariant\nones. We formulate this in terms of Markov categories, which allows us to\nconsider neural networks whose outputs may be stochastic, but with\nmeasure-theoretic details abstracted away. We obtain a flexible and\ncompositional framework for symmetrisation that relies on minimal assumptions\nabout the structure of the group and the underlying neural network\narchitecture. Our approach recovers existing canonicalisation and averaging\ntechniques for symmetrising deterministic models, and extends to provide a\nnovel methodology for symmetrising stochastic models also. Beyond this, our\nfindings also demonstrate the utility of Markov categories for addressing\ncomplex problems in machine learning in a conceptually clear yet mathematically\nprecise way.\n","authors":["Rob Cornish"],"pdf_url":"https://arxiv.org/pdf/2406.11814v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.00799v2","updated":"2025-01-07T17:32:19Z","published":"2025-01-01T10:50:35Z","title":"Follow The Approximate Sparse Leader for No-Regret Online Sparse Linear\n Approximation","summary":" We consider the problem of \\textit{online sparse linear approximation}, where\none predicts the best sparse approximation of a sequence of measurements in\nterms of linear combination of columns of a given measurement matrix. Such\nonline prediction problems are ubiquitous, ranging from medical trials to web\ncaching to resource allocation. The inherent difficulty of offline recovery\nalso makes the online problem challenging. 
In this letter, we propose\nFollow-The-Approximate-Sparse-Leader, an efficient online meta-policy to\naddress this online problem. Through a detailed theoretical analysis, we prove\nthat under certain assumptions on the measurement sequence, the proposed policy\nenjoys a data-dependent sublinear upper bound on the static regret, which can\nrange from logarithmic to square-root. Numerical simulations are performed to\ncorroborate the theoretical findings and demonstrate the efficacy of the\nproposed online policy.\n","authors":["Samrat Mukhopadhyay","Debasmita Mukherjee"],"pdf_url":"https://arxiv.org/pdf/2501.00799v2.pdf","comment":"12 pages, 5 figures, corrected title, added proof of a lemma in\n appendix"},{"id":"http://arxiv.org/abs/2301.08110v6","updated":"2025-01-07T17:26:26Z","published":"2023-01-19T15:01:00Z","title":"AtMan: Understanding Transformer Predictions Through Memory Efficient\n Attention Manipulation","summary":" Generative transformer models have become increasingly complex, with large\nnumbers of parameters and the ability to process multiple input modalities.\nCurrent methods for explaining their predictions are resource-intensive. Most\ncrucially, they require prohibitively large amounts of extra memory, since they\nrely on backpropagation which allocates almost twice as much GPU memory as the\nforward pass. This makes it difficult, if not impossible, to use them in\nproduction. We present AtMan that provides explanations of generative\ntransformer models at almost no extra cost. Specifically, AtMan is a\nmodality-agnostic perturbation method that manipulates the attention mechanisms\nof transformers to produce relevance maps for the input with respect to the\noutput prediction. Instead of using backpropagation, AtMan applies a\nparallelizable token-based search method based on cosine similarity\nneighborhood in the embedding space. 
Our exhaustive experiments on text and\nimage-text benchmarks demonstrate that AtMan outperforms current\nstate-of-the-art gradient-based methods on several metrics while being\ncomputationally efficient. As such, AtMan is suitable for use in large model\ninference deployments.\n","authors":["Björn Deiseroth","Mayukh Deb","Samuel Weinbach","Manuel Brack","Patrick Schramowski","Kristian Kersting"],"pdf_url":"https://arxiv.org/pdf/2301.08110v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.17547v2","updated":"2025-01-07T17:23:55Z","published":"2024-12-23T13:08:23Z","title":"Probability-density-aware Semi-supervised Learning","summary":" Semi-supervised learning (SSL) assumes that neighbor points lie in the same\ncategory (neighbor assumption), and points in different clusters belong to\nvarious categories (cluster assumption). Existing methods usually rely on\nsimilarity measures to retrieve the similar neighbor points, ignoring cluster\nassumption, which may not utilize unlabeled information sufficiently and\neffectively. This paper first provides a systematical investigation into the\nsignificant role of probability density in SSL and lays a solid theoretical\nfoundation for cluster assumption. To this end, we introduce a\nProbability-Density-Aware Measure (PM) to discern the similarity between\nneighbor points. To further improve Label Propagation, we also design a\nProbability-Density-Aware Measure Label Propagation (PMLP) algorithm to fully\nconsider the cluster assumption in label propagation. Last but not least, we\nprove that traditional pseudo-labeling could be viewed as a particular case of\nPMLP, which provides a comprehensive theoretical understanding of PMLP's\nsuperior performance. 
Extensive experiments demonstrate that PMLP achieves\noutstanding performance compared with other recent methods.\n","authors":["Shuyang Liu","Ruiqiu Zheng","Yunhang Shen","Ke Li","Xing Sun","Zhou Yu","Shaohui Lin"],"pdf_url":"https://arxiv.org/pdf/2412.17547v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.16834v2","updated":"2025-01-07T17:05:11Z","published":"2024-06-24T17:42:03Z","title":"Statistical Error Bounds for GANs with Nonlinear Objective Functionals","summary":" Generative adversarial networks (GANs) are unsupervised learning methods for\ntraining a generator distribution to produce samples that approximate those\ndrawn from a target distribution. Many such methods can be formulated as\nminimization of a metric or divergence between probability distributions.\nRecent works have derived statistical error bounds for GANs that are based on\nintegral probability metrics (IPMs), e.g., WGAN which is based on the\n1-Wasserstein metric. In general, IPMs are defined by optimizing a linear\nfunctional (difference of expectations) over a space of discriminators. A much\nlarger class of GANs, which we here call $(f,\\Gamma)$-GANs, can be constructed\nusing $f$-divergences (e.g., Jensen-Shannon, KL, or $\\alpha$-divergences)\ntogether with a regularizing discriminator space $\\Gamma$ (e.g., $1$-Lipschitz\nfunctions). These GANs have nonlinear objective functions, depending on the\nchoice of $f$, and have been shown to exhibit improved performance in a number\nof applications. In this work we derive statistical error bounds for\n$(f,\\Gamma)$-GANs for general classes of $f$ and $\\Gamma$ in the form of\nfinite-sample concentration inequalities. These results prove the statistical\nconsistency of $(f,\\Gamma)$-GANs and reduce to the known results for IPM-GANs\nin the appropriate limit. 
Finally, our results also give new insight into the\nperformance of GANs for distributions with unbounded support.\n","authors":["Jeremiah Birrell"],"pdf_url":"https://arxiv.org/pdf/2406.16834v2.pdf","comment":"27 pages"},{"id":"http://arxiv.org/abs/2501.03941v1","updated":"2025-01-07T17:02:33Z","published":"2025-01-07T17:02:33Z","title":"Synthetic Data Privacy Metrics","summary":" Recent advancements in generative AI have made it possible to create\nsynthetic datasets that can be as accurate as real-world data for training AI\nmodels, powering statistical insights, and fostering collaboration with\nsensitive datasets while offering strong privacy guarantees. Effectively\nmeasuring the empirical privacy of synthetic data is an important step in the\nprocess. However, while there is a multitude of new privacy metrics being\npublished every day, there currently is no standardization. In this paper, we\nreview the pros and cons of popular metrics that include simulations of\nadversarial attacks. We also review current best practices for amending\ngenerative models to enhance the privacy of the data they create (e.g.\ndifferential privacy).\n","authors":["Amy Steier","Lipika Ramaswamy","Andre Manoel","Alexa Haushalter"],"pdf_url":"https://arxiv.org/pdf/2501.03941v1.pdf","comment":"14 pages, 2 figures"},{"id":"http://arxiv.org/abs/2402.14746v3","updated":"2025-01-07T16:58:05Z","published":"2024-02-22T18:06:19Z","title":"Scaling Efficient LLMs","summary":" Trained LLMs are typically sparse in that most of the parameters are zero,\nraising questions on efficiency. In response, we inquire into efficient LLMs,\ni.e. those with the fewest parameters that achieve the desired accuracy on a\ntraining corpus. Specifically, we compare theoretical and empirical estimates\nfor training loss to obtain upper and lower bounds on the number of unique\nsequences in a natural training corpus as a function of its size. 
Our result\nimplies (1) to double the number of skills represented in a training corpus,\nthe corpus must scale more than four fold (2) for efficient LLMs, the number of\nparameters N and the size D of a natural training corpus scale as $N \\propto\nD^{0.44}$; (3) if the number of parameters of an LLM is smaller than the number\nof unique sequences in the training corpus, scaling up can uncover emergent\nskills.\n","authors":["B. N. Kausik"],"pdf_url":"https://arxiv.org/pdf/2402.14746v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03937v1","updated":"2025-01-07T16:56:40Z","published":"2025-01-07T16:56:40Z","title":"A precise asymptotic analysis of learning diffusion models: theory and\n insights","summary":" In this manuscript, we consider the problem of learning a flow or\ndiffusion-based generative model parametrized by a two-layer auto-encoder,\ntrained with online stochastic gradient descent, on a high-dimensional target\ndensity with an underlying low-dimensional manifold structure. We derive a\ntight asymptotic characterization of low-dimensional projections of the\ndistribution of samples generated by the learned model, ascertaining in\nparticular its dependence on the number of training samples. Building on this\nanalysis, we discuss how mode collapse can arise, and lead to model collapse\nwhen the generative model is re-trained on generated synthetic data.\n","authors":["Hugo Cui","Cengiz Pehlevan","Yue M. Lu"],"pdf_url":"https://arxiv.org/pdf/2501.03937v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03928v1","updated":"2025-01-07T16:45:37Z","published":"2025-01-07T16:45:37Z","title":"From Newswire to Nexus: Using text-based actor embeddings and\n transformer networks to forecast conflict dynamics","summary":" This study advances the field of conflict forecasting by using text-based\nactor embeddings with transformer models to predict dynamic changes in violent\nconflict patterns at the actor level. 
More specifically, we combine newswire\ntexts with structured conflict event data and leverage recent advances in\nNatural Language Processing (NLP) techniques to forecast escalations and\nde-escalations among conflicting actors, such as governments, militias,\nseparatist movements, and terrorists. This new approach accurately and promptly\ncaptures the inherently volatile patterns of violent conflicts, which existing\nmethods have not been able to achieve. To create this framework, we began by\ncurating and annotating a vast international newswire corpus, leveraging\nhand-labeled event data from the Uppsala Conflict Data Program. By using this\nhybrid dataset, our models can incorporate the textual context of news sources\nalong with the precision and detail of structured event data. This combination\nenables us to make both dynamic and granular predictions about conflict\ndevelopments. We validate our approach through rigorous back-testing against\nhistorical events, demonstrating superior out-of-sample predictive power. We\nfind that our approach is quite effective in identifying and predicting phases\nof conflict escalation and de-escalation, surpassing the capabilities of\ntraditional models. By focusing on actor interactions, our explicit goal is to\nprovide actionable insights to policymakers, humanitarian organizations, and\npeacekeeping operations in order to enable targeted and effective intervention\nstrategies.\n","authors":["Mihai Croicu","Simon Polichinel von der Maase"],"pdf_url":"https://arxiv.org/pdf/2501.03928v1.pdf","comment":"35 pages, 5 figures. 
Paper presented at the 120th American Political\n Science Association Annual Meeting"},{"id":"http://arxiv.org/abs/2501.03923v1","updated":"2025-01-07T16:35:29Z","published":"2025-01-07T16:35:29Z","title":"Explainable AI model reveals disease-related mechanisms in single-cell\n RNA-seq data","summary":" Neurodegenerative diseases (NDDs) are complex and lack effective treatment\ndue to their poorly understood mechanism. The increasingly used data analysis\nfrom Single nucleus RNA Sequencing (snRNA-seq) allows to explore transcriptomic\nevents at a single cell level, yet face challenges in interpreting the\nmechanisms underlying a disease. On the other hand, Neural Network (NN) models\ncan handle complex data to offer insights but can be seen as black boxes with\npoor interpretability. In this context, explainable AI (XAI) emerges as a\nsolution that could help to understand disease-associated mechanisms when\ncombined with efficient NN models. However, limited research explores XAI in\nsingle-cell data. In this work, we implement a method for identifying\ndisease-related genes and the mechanistic explanation of disease progression\nbased on NN model combined with SHAP. We analyze available Huntington's disease\n(HD) data to identify both HD-altered genes and mechanisms by adding Gene Set\nEnrichment Analysis (GSEA) comparing two methods, differential gene expression\nanalysis (DGE) and NN combined with SHAP approach. 
Our results show that DGE\nand SHAP approaches offer both common and differential sets of altered genes\nand pathways, reinforcing the usefulness of XAI methods for a broader\nperspective of disease.\n","authors":["Mohammad Usman","Olga Varea","Petia Radeva","Josep Canals","Jordi Abante","Daniel Ortiz"],"pdf_url":"https://arxiv.org/pdf/2501.03923v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.19218v4","updated":"2025-01-07T16:31:31Z","published":"2023-10-30T01:34:33Z","title":"Exploring Federated Unlearning: Analysis, Comparison, and Insights","summary":" The increasing demand for privacy-preserving machine learning has spurred\ninterest in federated unlearning, which enables the selective removal of data\nfrom models trained in federated systems. However, developing federated\nunlearning methods presents challenges, particularly in balancing three often\nconflicting objectives: privacy, accuracy, and efficiency. This paper provides\na comprehensive analysis of existing federated unlearning approaches, examining\ntheir algorithmic efficiency, impact on model accuracy, and effectiveness in\npreserving privacy. We discuss key trade-offs among these dimensions and\nhighlight their implications for practical applications across various domains.\nAdditionally, we propose the OpenFederatedUnlearning framework, a unified\nbenchmark for evaluating federated unlearning methods, incorporating classic\nbaselines and diverse performance metrics. Our findings aim to guide\npractitioners in navigating the complex interplay of these objectives, offering\ninsights to achieve effective and efficient federated unlearning. Finally, we\noutline directions for future research to further advance the state of\nfederated unlearning techniques.\n","authors":["Yang Zhao","Jiaxi Yang","Yiling Tao","Lixu Wang","Xiaoxiao Li","Dusit Niyato","H. 
Vincent Poor"],"pdf_url":"https://arxiv.org/pdf/2310.19218v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.19223v2","updated":"2025-01-07T16:20:17Z","published":"2024-06-27T14:49:08Z","title":"T-FREE: Subword Tokenizer-Free Generative LLMs via Sparse\n Representations for Memory-Efficient Embeddings","summary":" Tokenizers are crucial for encoding information in Large Language Models, but\ntheir development has recently stagnated, and they contain inherent weaknesses.\nMajor limitations include computational overhead, ineffective vocabulary use,\nand unnecessarily large embedding and head layers. Additionally, their\nperformance is biased towards a reference corpus, leading to reduced\neffectiveness for underrepresented languages.\n To remedy these issues, we propose T-FREE, which directly embeds words\nthrough sparse activation patterns over character triplets, and does not\nrequire a reference corpus. T-FREE inherently exploits morphological\nsimilarities and allows for strong compression of embedding layers. In our\nexhaustive experimental evaluation, we achieve competitive downstream\nperformance with a parameter reduction of more than 85% on these layers.\nFurther, T-FREE shows significant improvements in cross-lingual transfer\nlearning.\n","authors":["Björn Deiseroth","Manuel Brack","Patrick Schramowski","Kristian Kersting","Samuel Weinbach"],"pdf_url":"https://arxiv.org/pdf/2406.19223v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03905v1","updated":"2025-01-07T16:19:40Z","published":"2025-01-07T16:19:40Z","title":"mFabric: An Efficient and Scalable Fabric for Mixture-of-Experts\n Training","summary":" Mixture-of-Expert (MoE) models outperform conventional models by selectively\nactivating different subnets, named \\emph{experts}, on a per-token basis. 
This\ngated computation generates dynamic communications that cannot be determined\nbeforehand, challenging the existing GPU interconnects that remain\n\\emph{static} during the distributed training process. In this paper, we\nadvocate for a first-of-its-kind system, called mFabric, that unlocks topology\nreconfiguration \\emph{during} distributed MoE training. Towards this vision, we\nfirst perform a production measurement study and show that the MoE dynamic\ncommunication pattern has \\emph{strong locality}, alleviating the requirement\nof global reconfiguration. Based on this, we design and implement a\n\\emph{regionally reconfigurable high-bandwidth domain} on top of existing\nelectrical interconnects using optical circuit switching (OCS), achieving\nscalability while maintaining rapid adaptability. We have built a fully\nfunctional mFabric prototype with commodity hardware and a customized\ncollective communication runtime that trains state-of-the-art MoE models with\n\\emph{in-training} topology reconfiguration across 32 A100 GPUs. Large-scale\npacket-level simulations show that mFabric delivers comparable performance as\nthe non-blocking fat-tree fabric while boosting the training cost efficiency\n(e.g., performance per dollar) of four representative MoE models by\n1.2$\\times$--1.5$\\times$ and 1.9$\\times$--2.3$\\times$ at 100 Gbps and 400 Gbps\nlink bandwidths, respectively.\n","authors":["Xudong Liao","Yijun Sun","Han Tian","Xinchen Wan","Yilun Jin","Zilong Wang","Zhenghang Ren","Xinyang Huang","Wenxue Li","Kin Fai Tse","Zhizhen Zhong","Guyue Liu","Ying Zhang","Xiaofeng Ye","Yiming Zhang","Kai Chen"],"pdf_url":"https://arxiv.org/pdf/2501.03905v1.pdf","comment":"Corresponding authors: zhizhenz@mit.edu (Z. Zhong),\n kaichen@cse.ust.hk (K. 
Chen)"},{"id":"http://arxiv.org/abs/2501.03904v1","updated":"2025-01-07T16:18:55Z","published":"2025-01-07T16:18:55Z","title":"Exploring the Potential of Large Language Models in Public\n Transportation: San Antonio Case Study","summary":" The integration of large language models (LLMs) into public transit systems\npresents a transformative opportunity to enhance urban mobility. This study\nexplores the potential of LLMs to revolutionize public transportation\nmanagement within the context of San Antonio's transit system. Leveraging the\ncapabilities of LLMs in natural language processing and data analysis, we\ninvestigate their capabilities to optimize route planning, reduce wait times,\nand provide personalized travel assistance. By utilizing the General Transit\nFeed Specification (GTFS) and other relevant data, this research aims to\ndemonstrate how LLMs can potentially improve resource allocation, elevate\npassenger satisfaction, and inform data-driven decision-making in transit\noperations. A comparative analysis of different ChatGPT models was conducted to\nassess their ability to understand transportation information, retrieve\nrelevant data, and provide comprehensive responses. 
Findings from this study\nsuggest that while LLMs hold immense promise for public transit, careful\nengineering and fine-tuning are essential to realizing their full potential.\nSan Antonio serves as a case study to inform the development of LLM-powered\ntransit systems in other urban environments.\n","authors":["Ramya Jonnala","Gongbo Liang","Jeong Yang","Izzat Alsmadi"],"pdf_url":"https://arxiv.org/pdf/2501.03904v1.pdf","comment":"This work is accepted to AAAI 2025 Workshop on AI for Urban Planning.\n arXiv admin note: substantial text overlap with arXiv:2407.11003"},{"id":"http://arxiv.org/abs/2412.06866v3","updated":"2025-01-07T16:16:49Z","published":"2024-12-09T09:31:58Z","title":"LMS-AutoTSF: Learnable Multi-Scale Decomposition and Integrated\n Autocorrelation for Time Series Forecasting","summary":" Time series forecasting is an important challenge with significant\napplications in areas such as weather prediction, stock market analysis,\nscientific simulations and industrial process analysis. In this work, we\nintroduce LMS-AutoTSF, a novel time series forecasting architecture that\nincorporates autocorrelation while leveraging dual encoders operating at\nmultiple scales. Unlike models that rely on predefined trend and seasonal\ncomponents, LMS-AutoTSF employs two separate encoders per scale: one focusing\non low-pass filtering to capture trends and the other utilizing high-pass\nfiltering to model seasonal variations. These filters are learnable, allowing\nthe model to dynamically adapt and isolate trend and seasonal components\ndirectly in the frequency domain. A key innovation in our approach is the\nintegration of autocorrelation, achieved by computing lagged differences in\ntime steps, which enables the model to capture dependencies across time more\neffectively. Each encoder processes the input through fully connected layers to\nhandle temporal and channel interactions. 
By combining frequency-domain\nfiltering, autocorrelation-based temporal modeling, and channel-wise\ntransformations, LMS-AutoTSF not only accurately captures long-term\ndependencies and fine-grained patterns but also operates more efficiently\ncompared to other state-of-the-art methods. Its lightweight design ensures\nfaster processing while maintaining high precision in forecasting across\ndiverse time horizons. The source code is publicly available at\n\\url{http://github.com/mribrahim/LMS-TSF}\n","authors":["Ibrahim Delibasoglu","Sanjay Chakraborty","Fredrik Heintz"],"pdf_url":"https://arxiv.org/pdf/2412.06866v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12615v3","updated":"2025-01-07T16:11:10Z","published":"2023-11-21T13:59:00Z","title":"Koopman Learning with Episodic Memory","summary":" Koopman operator theory has found significant success in learning models of\ncomplex, real-world dynamical systems, enabling prediction and control. The\ngreater interpretability and lower computational costs of these models,\ncompared to traditional machine learning methodologies, make Koopman learning\nan especially appealing approach. Despite this, little work has been performed\non endowing Koopman learning with the ability to leverage its own failures. To\naddress this, we equip Koopman methods -- developed for predicting\nnon-autonomous time-series -- with an episodic memory mechanism, enabling\nglobal recall of (or attention to) periods in time where similar dynamics\npreviously occurred. We find that a basic implementation of Koopman learning\nwith episodic memory leads to significant improvements in prediction on\nsynthetic and real-world data. Our framework has considerable potential for\nexpansion, allowing for future advances, and opens exciting new directions for\nKoopman learning.\n","authors":["William T. 
Redman","Dean Huang","Maria Fonoberova","Igor Mezić"],"pdf_url":"https://arxiv.org/pdf/2311.12615v3.pdf","comment":"17 pages, 7 figures"},{"id":"http://arxiv.org/abs/2501.03902v1","updated":"2025-01-07T16:10:09Z","published":"2025-01-07T16:10:09Z","title":"Explainable Reinforcement Learning via Temporal Policy Decomposition","summary":" We investigate the explainability of Reinforcement Learning (RL) policies\nfrom a temporal perspective, focusing on the sequence of future outcomes\nassociated with individual actions. In RL, value functions compress information\nabout rewards collected across multiple trajectories and over an infinite\nhorizon, allowing a compact form of knowledge representation. However, this\ncompression obscures the temporal details inherent in sequential\ndecision-making, presenting a key challenge for interpretability. We present\nTemporal Policy Decomposition (TPD), a novel explainability approach that\nexplains individual RL actions in terms of their Expected Future Outcome (EFO).\nThese explanations decompose generalized value functions into a sequence of\nEFOs, one for each time step up to a prediction horizon of interest, revealing\ninsights into when specific outcomes are expected to occur. We leverage\nfixed-horizon temporal difference learning to devise an off-policy method for\nlearning EFOs for both optimal and suboptimal actions, enabling contrastive\nexplanations consisting of EFOs for different state-action pairs. 
Our\nexperiments demonstrate that TPD generates accurate explanations that (i)\nclarify the policy's future strategy and anticipated trajectory for a given\naction and (ii) improve understanding of the reward composition, facilitating\nfine-tuning of the reward function to align with human expectations.\n","authors":["Franco Ruggeri","Alessio Russo","Rafia Inam","Karl Henrik Johansson"],"pdf_url":"https://arxiv.org/pdf/2501.03902v1.pdf","comment":"21 pages, 4 figures"},{"id":"http://arxiv.org/abs/2410.23440v2","updated":"2025-01-07T16:07:33Z","published":"2024-10-30T20:32:30Z","title":"Learning Lipschitz Operators with respect to Gaussian Measures with\n Near-Optimal Sample Complexity","summary":" Operator learning, the approximation of mappings between infinite-dimensional\nfunction spaces using ideas from machine learning, has gained increasing\nresearch attention in recent years. Approximate operators, learned from data,\nhold promise to serve as efficient surrogate models for problems in\ncomputational science and engineering, complementing traditional numerical\nmethods. However, despite their empirical success, our understanding of the\nunderpinning mathematical theory is in large part still incomplete. In this\npaper, we study the approximation of Lipschitz operators in expectation with\nrespect to Gaussian measures. We prove higher Gaussian Sobolev regularity of\nLipschitz operators and establish lower and upper bounds on the Hermite\npolynomial approximation error. We further consider the reconstruction of\nLipschitz operators from $m$ arbitrary (adaptive) linear samples. A key finding\nis the tight characterization of the smallest achievable error for all possible\n(adaptive) sampling and reconstruction maps in terms of $m$. 
It is shown that\nHermite polynomial approximation is an optimal recovery strategy, but we have\nthe following curse of sample complexity: No method to approximate Lipschitz\noperators based on $m$ samples can achieve algebraic convergence rates in $m$.\nOn the positive side, we prove that a sufficiently fast spectral decay of the\ncovariance operator of the Gaussian measure guarantees convergence rates which\nare arbitrarily close to any algebraic rate in the large data limit $m \\to\n\\infty$. A main focus of this work is on the recovery of Lipschitz operators\nfrom finitely many point samples. We use Christoffel sampling and weighted\nleast-squares approximation to propose an algorithm which provably achieves\nnear-optimal sample complexity in high probability.\n","authors":["Ben Adcock","Michael Griebel","Gregor Maier"],"pdf_url":"https://arxiv.org/pdf/2410.23440v2.pdf","comment":"56 pages"},{"id":"http://arxiv.org/abs/2408.11876v2","updated":"2025-01-07T16:01:15Z","published":"2024-08-20T13:19:06Z","title":"From Glucose Patterns to Health Outcomes: A Generalizable Foundation\n Model for Continuous Glucose Monitor Data Analysis","summary":" Recent advances in SSL enabled novel medical AI models, known as foundation\nmodels, offer great potential for better characterizing health from diverse\nbiomedical data. CGM provides rich, temporal data on glycemic patterns, but its\nfull potential for predicting broader health outcomes remains underutilized.\nHere, we present GluFormer, a generative foundation model for CGM data that\nlearns nuanced glycemic patterns and translates them into predictive\nrepresentations of metabolic health. Trained on over 10 million CGM\nmeasurements from 10,812 adults, primarily without diabetes, GluFormer uses\nautoregressive token prediction to capture longitudinal glucose dynamics. 
We\nshow that GluFormer generalizes to 19 external cohorts (n=6,044) spanning\ndifferent ethnicities and ages, 5 countries, 8 CGM devices, and diverse\npathophysiological states. GluFormers representations exceed the performance of\ncurrent CGM metrics, such as the Glucose Management Indicator (GMI), for\nforecasting clinical measures. In a longitudinal study of 580 adults with CGM\ndata and 12-year follow-up, GluFormer identifies individuals at elevated risk\nof developing diabetes more effectively than blood HbA1C%, capturing 66% of all\nnew-onset diabetes diagnoses in the top quartile versus 7% in the bottom\nquartile. Similarly, 69% of cardiovascular-death events occurred in the top\nquartile with none in the bottom quartile, demonstrating powerful risk\nstratification beyond traditional glycemic metrics. We also show that CGM\nrepresentations from pre-intervention periods in Randomized Clinical Trials\noutperform other methods in predicting primary and secondary outcomes. When\nintegrating dietary data into GluFormer, we show that the multi-modal version\nof the model can accurately generate CGM data based on dietary intake data,\nsimulate outcomes of dietary interventions, and predict individual responses to\nspecific foods.\n","authors":["Guy Lutsker","Gal Sapir","Smadar Shilo","Jordi Merino","Anastasia Godneva","Jerry R Greenfield","Dorit Samocha-Bonet","Raja Dhir","Francisco Gude","Shie Mannor","Eli Meirom","Gal Chechik","Hagai Rossman","Eran Segal"],"pdf_url":"https://arxiv.org/pdf/2408.11876v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01006v3","updated":"2025-01-07T16:00:44Z","published":"2024-11-01T20:04:59Z","title":"Abstracted Shapes as Tokens -- A Generalizable and Interpretable Model\n for Time-series Classification","summary":" In time-series analysis, many recent works seek to provide a unified view and\nrepresentation for time-series across multiple domains, leading to the\ndevelopment of foundation models for time-series data. 
Despite diverse modeling\ntechniques, existing models are black boxes and fail to provide insights and\nexplanations about their representations. In this paper, we present VQShape, a\npre-trained, generalizable, and interpretable model for time-series\nrepresentation learning and classification. By introducing a novel\nrepresentation for time-series data, we forge a connection between the latent\nspace of VQShape and shape-level features. Using vector quantization, we show\nthat time-series from different domains can be described using a unified set of\nlow-dimensional codes, where each code can be represented as an abstracted\nshape in the time domain. On classification tasks, we show that the\nrepresentations of VQShape can be utilized to build interpretable classifiers,\nachieving comparable performance to specialist models. Additionally, in\nzero-shot learning, VQShape and its codebook can generalize to previously\nunseen datasets and domains that are not included in the pre-training process.\nThe code and pre-trained weights are available at\nhttps://github.com/YunshiWen/VQShape.\n","authors":["Yunshi Wen","Tengfei Ma","Tsui-Wei Weng","Lam M. Nguyen","Anak Agung Julius"],"pdf_url":"https://arxiv.org/pdf/2411.01006v3.pdf","comment":"Published in Neural Information Processing Systems (NeurIPS) 2024"},{"id":"http://arxiv.org/abs/2501.03888v1","updated":"2025-01-07T15:51:49Z","published":"2025-01-07T15:51:49Z","title":"Neural DNF-MT: A Neuro-symbolic Approach for Learning Interpretable and\n Editable Policies","summary":" Although deep reinforcement learning has been shown to be effective, the\nmodel's black-box nature presents barriers to direct policy interpretation. To\naddress this problem, we propose a neuro-symbolic approach called neural DNF-MT\nfor end-to-end policy learning. The differentiable nature of the neural DNF-MT\nmodel enables the use of deep actor-critic algorithms for training. 
At the same\ntime, its architecture is designed so that trained models can be directly\ntranslated into interpretable policies expressed as standard (bivalent or\nprobabilistic) logic programs. Moreover, additional layers can be included to\nextract abstract features from complex observations, acting as a form of\npredicate invention. The logic representations are highly interpretable, and we\nshow how the bivalent representations of deterministic policies can be edited\nand incorporated back into a neural model, facilitating manual intervention and\nadaptation of learned policies. We evaluate our approach on a range of tasks\nrequiring learning deterministic or stochastic behaviours from various forms of\nobservations. Our empirical results show that our neural DNF-MT model performs\nat the level of competing black-box methods whilst providing interpretable\npolicies.\n","authors":["Kexin Gu Baugh","Luke Dickens","Alessandra Russo"],"pdf_url":"https://arxiv.org/pdf/2501.03888v1.pdf","comment":"AAMAS 2025"},{"id":"http://arxiv.org/abs/2410.11463v2","updated":"2025-01-07T15:48:15Z","published":"2024-10-15T10:10:33Z","title":"Advanced Persistent Threats (APT) Attribution Using Deep Reinforcement\n Learning","summary":" The development of the DRL model for malware attribution involved extensive\nresearch, iterative coding, and numerous adjustments based on the insights\ngathered from predecessor models and contemporary research papers. This\npreparatory work was essential to establish a robust foundation for the model,\nensuring it could adapt and respond effectively to the dynamic nature of\nmalware threats. Initially, the model struggled with low accuracy levels, but\nthrough persistent adjustments to its architecture and learning algorithms,\naccuracy improved dramatically from about 7 percent to over 73 percent in early\niterations. 
By the end of the training, the model consistently reached accuracy\nlevels near 98 percent, demonstrating its strong capability to accurately\nrecognise and attribute malware activities. This upward trajectory in training\naccuracy is graphically represented in the Figure, which vividly illustrates\nthe model's maturation and increasing proficiency over time.\n","authors":["Animesh Singh Basnet","Mohamed Chahine Ghanem","Dipo Dunsin","Wiktor Sowinski-Mydlarz"],"pdf_url":"https://arxiv.org/pdf/2410.11463v2.pdf","comment":"21 Pages"},{"id":"http://arxiv.org/abs/2405.03732v3","updated":"2025-01-07T15:46:25Z","published":"2024-05-06T10:53:13Z","title":"Deep Learning-based Accelerated MR Cholangiopancreatography without\n Fully-sampled Data","summary":" The purpose of this study was to accelerate MR cholangiopancreatography\n(MRCP) acquisitions using deep learning-based (DL) reconstruction at 3T and\n0.55T. A total of 35 healthy volunteers underwent conventional two-fold\naccelerated MRCP scans at field strengths of 3T and 0.55T. We trained DL\nreconstructions using two different training strategies, supervised (SV) and\nself-supervised (SSV), with retrospectively six-fold undersampled data obtained\nat 3T. We then evaluated the DL reconstructions against standard techniques,\nparallel imaging (PI) and compressed sensing (CS), focusing on peak\nsignal-to-noise ratio (PSNR) and structural similarity (SSIM) as metrics. We\nalso tested DL reconstructions with prospectively accelerated acquisitions and\nevaluated their robustness when changing field strengths from 3T to 0.55T. DL\nreconstructions demonstrated a reduction in average acquisition time from\n599/542 to 255/180 seconds for MRCP at 3T/0.55T. In both retrospective and\nprospective undersampling, PSNR and SSIM of DL reconstructions were higher than\nthose of PI and CS. 
At the same time, DL reconstructions preserved the image\nquality of undersampled data, including sharpness and the visibility of\nhepatobiliary ducts. In addition, both DL approaches produced high-quality\nreconstructions at 0.55T. In summary, DL reconstructions trained for highly\naccelerated MRCP enabled a reduction in acquisition time by a factor of 2.4/3.0\nat 3T/0.55T while maintaining the image quality of conventional acquisitions.\n","authors":["Jinho Kim","Marcel Dominik Nickel","Florian Knoll"],"pdf_url":"https://arxiv.org/pdf/2405.03732v3.pdf","comment":"19 pages, 4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2501.03880v1","updated":"2025-01-07T15:43:36Z","published":"2025-01-07T15:43:36Z","title":"SELMA3D challenge: Self-supervised learning for 3D light-sheet\n microscopy image segmentation","summary":" Recent innovations in light sheet microscopy, paired with developments in\ntissue clearing techniques, enable the 3D imaging of large mammalian tissues\nwith cellular resolution. Combined with the progress in large-scale data\nanalysis, driven by deep learning, these innovations empower researchers to\nrapidly investigate the morphological and functional properties of diverse\nbiological samples. Segmentation, a crucial preliminary step in the analysis\nprocess, can be automated using domain-specific deep learning models with\nexpert-level performance. However, these models exhibit high sensitivity to\ndomain shifts, leading to a significant drop in accuracy when applied to data\noutside their training distribution. To address this limitation, and inspired\nby the recent success of self-supervised learning in training generalizable\nmodels, we organized the SELMA3D Challenge during the MICCAI 2024 conference.\nSELMA3D provides a vast collection of light-sheet images from cleared mice and\nhuman brains, comprising 35 large 3D images-each with over 1000^3 voxels-and\n315 annotated small patches for finetuning, preliminary testing and final\ntesting. 
The dataset encompasses diverse biological structures, including\nvessel-like and spot-like structures. Five teams participated in all phases of\nthe challenge, and their proposed methods are reviewed in this paper.\nQuantitative and qualitative results from most participating teams demonstrate\nthat self-supervised learning on large datasets improves segmentation model\nperformance and generalization. We will continue to support and extend SELMA3D\nas an inaugural MICCAI challenge focused on self-supervised learning for 3D\nmicroscopy image segmentation.\n","authors":["Ying Chen","Rami Al-Maskari","Izabela Horvath","Mayar Ali","Luciano Höher","Kaiyuan Yang","Zengming Lin","Zhiwei Zhai","Mengzhe Shen","Dejin Xun","Yi Wang","Tony Xu","Maged Goubran","Yunheng Wu","Ali Erturk","Johannes C. Paetzold"],"pdf_url":"https://arxiv.org/pdf/2501.03880v1.pdf","comment":"1st version"},{"id":"http://arxiv.org/abs/2501.03877v1","updated":"2025-01-07T15:40:22Z","published":"2025-01-07T15:40:22Z","title":"Stochastically Constrained Best Arm Identification with Thompson\n Sampling","summary":" We consider the problem of the best arm identification in the presence of\nstochastic constraints, where there is a finite number of arms associated with\nmultiple performance measures. The goal is to identify the arm that optimizes\nthe objective measure subject to constraints on the remaining measures. We will\nexplore the popular idea of Thompson sampling (TS) as a means to solve it. To\nthe best of our knowledge, it is the first attempt to extend TS to this\nproblem. 
We will design a TS-based sampling algorithm, establish its asymptotic\noptimality in the rate of posterior convergence, and demonstrate its superior\nperformance using numerical examples.\n","authors":["Le Yang","Siyang Gao","Cheng Li","Yi Wang"],"pdf_url":"https://arxiv.org/pdf/2501.03877v1.pdf","comment":"30 pages, 12 figures, 1 table"},{"id":"http://arxiv.org/abs/2501.03874v1","updated":"2025-01-07T15:38:13Z","published":"2025-01-07T15:38:13Z","title":"Neuromorphic Optical Tracking and Imaging of Randomly Moving Targets\n through Strongly Scattering Media","summary":" Tracking and acquiring simultaneous optical images of randomly moving targets\nobscured by scattering media remains a challenging problem of importance to\nmany applications that require precise object localization and identification.\nIn this work we develop an end-to-end neuromorphic optical engineering and\ncomputational approach to demonstrate how to track and image normally invisible\nobjects by combining an event detecting camera with a multistage neuromorphic\ndeep learning strategy. Photons emerging from dense scattering media are\ndetected by the event camera and converted to pixel-wise asynchronized spike\ntrains - a first step in isolating object-specific information from the\ndominant uninformative background. Spiking data is fed into a deep spiking\nneural network (SNN) engine where object tracking and image reconstruction are\nperformed by two separate yet interconnected modules running in parallel in\ndiscrete time steps over the event duration. Through benchtop experiments we\ndemonstrate tracking and imaging randomly moving objects in dense turbid media\nas well as image reconstruction of spatially stationary but optically dynamic\nobjects. Standardized character sets serve as representative proxies for\ngeometrically complex objects, underscoring the method's generality. 
The\nresults highlight the advantages of a fully neuromorphic approach in meeting a\nmajor imaging technology challenge with high computational efficiency and low power\nconsumption.\n","authors":["Ning Zhang","Timothy Shea","Arto Nurmikko"],"pdf_url":"https://arxiv.org/pdf/2501.03874v1.pdf","comment":"22 pages, 6 figures"},{"id":"http://arxiv.org/abs/2410.13850v3","updated":"2025-01-07T15:28:09Z","published":"2024-10-17T17:59:02Z","title":"Influence Functions for Scalable Data Attribution in Diffusion Models","summary":" Diffusion models have led to significant advancements in generative\nmodelling. Yet their widespread adoption poses challenges regarding data\nattribution and interpretability. In this paper, we aim to help address such\nchallenges in diffusion models by developing an influence functions framework.\nInfluence function-based data attribution methods approximate how a model's\noutput would have changed if some training data were removed. In supervised\nlearning, this is usually used for predicting how the loss on a particular\nexample would change. For diffusion models, we focus on predicting the change\nin the probability of generating a particular example via several proxy\nmeasurements. We show how to formulate influence functions for such quantities\nand how previously proposed methods can be interpreted as particular design\nchoices in our framework. To ensure scalability of the Hessian computations in\ninfluence functions, we systematically develop K-FAC approximations based on\ngeneralised Gauss-Newton matrices specifically tailored to diffusion models. 
We\nrecast previously proposed methods as specific design choices in our framework\nand show that our recommended method outperforms previous data attribution\napproaches on common evaluations, such as the Linear Data-modelling Score (LDS)\nor retraining without top influences, without the need for method-specific\nhyperparameter tuning.\n","authors":["Bruno Mlodozeniec","Runa Eschenhagen","Juhan Bae","Alexander Immer","David Krueger","Richard Turner"],"pdf_url":"https://arxiv.org/pdf/2410.13850v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.11727v2","updated":"2025-01-07T15:26:14Z","published":"2024-05-20T02:09:07Z","title":"Highway Graph to Accelerate Reinforcement Learning","summary":" Reinforcement Learning (RL) algorithms often struggle with low training\nefficiency. A common approach to address this challenge is integrating\nmodel-based planning algorithms, such as Monte Carlo Tree Search (MCTS) or\nValue Iteration (VI), into the environmental model. However, VI requires\niterating over a large tensor which updates the value of the preceding state\nbased on the succeeding state through value propagation, resulting in\ncomputationally intensive operations. To enhance the RL training efficiency, we\npropose improving the efficiency of the value learning process. In\ndeterministic environments with discrete state and action spaces, we observe\nthat on the sampled empirical state-transition graph, a non-branching sequence\nof transitions-termed a highway-can take the agent to another state without\ndeviation through intermediate states. On these non-branching highways, the\nvalue-updating process can be streamlined into a single-step operation,\neliminating the need for step-by-step updates. Building on this observation, we\nintroduce the highway graph to model state transitions. 
The highway graph\ncompresses the transition model into a compact representation, where edges can\nencapsulate multiple state transitions, enabling value propagation across\nmultiple time steps in a single iteration. By integrating the highway graph\ninto RL, the training process is significantly accelerated, particularly in the\nearly stages of training. Experiments across four categories of environments\ndemonstrate that our method learns significantly faster than established and\nstate-of-the-art RL algorithms (often by a factor of 10 to 150) while\nmaintaining equal or superior expected returns. Furthermore, a deep neural\nnetwork-based agent trained using the highway graph exhibits improved\ngeneralization capabilities and reduced storage costs. Code is publicly\navailable at https://github.com/coodest/highwayRL.\n","authors":["Zidu Yin","Zhen Zhang","Dong Gong","Stefano V. Albrecht","Javen Q. Shi"],"pdf_url":"https://arxiv.org/pdf/2405.11727v2.pdf","comment":"Published in TMLR"},{"id":"http://arxiv.org/abs/2501.03865v1","updated":"2025-01-07T15:24:53Z","published":"2025-01-07T15:24:53Z","title":"Truthful mechanisms for linear bandit games with private contexts","summary":" The contextual bandit problem, where agents arrive sequentially with personal\ncontexts and the system adapts its arm allocation decisions accordingly, has\nrecently garnered increasing attention for enabling more personalized outcomes.\nHowever, in many healthcare and recommendation applications, agents have\nprivate profiles and may misreport their contexts to gain from the system. For\nexample, in adaptive clinical trials, where hospitals sequentially recruit\nvolunteers to test multiple new treatments and adjust plans based on\nvolunteers' reported profiles such as symptoms and interim data, participants\nmay misreport severe side effects like allergy and nausea to avoid perceived\nsuboptimal treatments. 
We are the first to study this issue of private context\nmisreporting in a stochastic contextual bandit game between the system and\nnon-repeated agents. We show that traditional low-regret algorithms, such as\nUCB family algorithms and Thompson sampling, fail to ensure truthful reporting\nand can result in linear regret in the worst case, while traditional truthful\nalgorithms like explore-then-commit (ETC) and $\\epsilon$-greedy algorithm incur\nsublinear but high regret. We propose a mechanism that uses a linear program to\nensure truthfulness while minimizing deviation from Thompson sampling, yielding\nan $O(\\ln T)$ frequentist regret. Our numerical experiments further demonstrate\nstrong performance in multiple contexts and across other distribution families.\n","authors":["Yiting Hu","Lingjie Duan"],"pdf_url":"https://arxiv.org/pdf/2501.03865v1.pdf","comment":"To appear at AAMAS 2025"},{"id":"http://arxiv.org/abs/2501.03858v1","updated":"2025-01-07T15:14:58Z","published":"2025-01-07T15:14:58Z","title":"Symmetry and Generalisation in Machine Learning","summary":" This work is about understanding the impact of invariance and equivariance on\ngeneralisation in supervised learning. We use the perspective afforded by an\naveraging operator to show that for any predictor that is not equivariant,\nthere is an equivariant predictor with strictly lower test risk on all\nregression problems where the equivariance is correctly specified. This\nconstitutes a rigorous proof that symmetry, in the form of invariance or\nequivariance, is a useful inductive bias.\n We apply these ideas to equivariance and invariance in random design least\nsquares and kernel ridge regression respectively. 
This allows us to specify the\nreduction in expected test risk in more concrete settings and express it in\nterms of properties of the group, the model and the data.\n Along the way, we give examples and additional results to demonstrate the\nutility of the averaging operator approach in analysing equivariant predictors.\nIn addition, we adopt an alternative perspective and formalise the common\nintuition that learning with invariant models reduces to a problem in terms of\norbit representatives. The formalism extends naturally to a similar intuition\nfor equivariant models. We conclude by connecting the two perspectives and\ngiving some ideas for future work.\n","authors":["Hayder Elesedy"],"pdf_url":"https://arxiv.org/pdf/2501.03858v1.pdf","comment":"PhD Thesis"},{"id":"http://arxiv.org/abs/2501.03038v2","updated":"2025-01-07T15:13:41Z","published":"2025-01-06T14:26:00Z","title":"Piano Transcription by Hierarchical Language Modeling with Pretrained\n Roll-based Encoders","summary":" Automatic Music Transcription (AMT), aiming to get musical notes from raw\naudio, typically uses frame-level systems with piano-roll outputs or language\nmodel (LM)-based systems with note-level predictions. However, frame-level\nsystems require manual thresholding, while the LM-based systems struggle with\nlong sequences. In this paper, we propose a hybrid method combining pre-trained\nroll-based encoders with an LM decoder to leverage the strengths of both\nmethods. Besides, our approach employs a hierarchical prediction strategy,\nfirst predicting onset and pitch, then velocity, and finally offset. The\nhierarchical prediction strategy reduces computational costs by breaking down\nlong sequences into different hierarchies. 
Evaluated on two benchmark\nroll-based encoders, our method outperforms traditional piano-roll outputs by 0.01\nand 0.022 in onset-offset-velocity F1 score, demonstrating its potential as a\nperformance-enhancing plug-in for arbitrary roll-based music transcription\nencoders.\n","authors":["Dichucheng Li","Yongyi Zang","Qiuqiang Kong"],"pdf_url":"https://arxiv.org/pdf/2501.03038v2.pdf","comment":"Accepted by ICASSP 2025"},{"id":"http://arxiv.org/abs/2501.03853v1","updated":"2025-01-07T15:10:07Z","published":"2025-01-07T15:10:07Z","title":"Leveraging time and parameters for nonlinear model reduction methods","summary":" In this paper, we consider model order reduction (MOR) methods for problems\nwith slowly decaying Kolmogorov $n$-widths as, e.g., certain wave-like or\ntransport-dominated problems. To overcome this Kolmogorov barrier within MOR,\nnonlinear projections are used, which are often realized numerically using\nautoencoders. These autoencoders generally consist of a nonlinear encoder and a\nnonlinear decoder and involve costly training of the hyperparameters to obtain\na good approximation quality of the reduced system. To facilitate the training\nprocess, we show that extending the to-be-reduced system and its corresponding\ntraining data makes it possible to replace the nonlinear encoder with a linear\nencoder without sacrificing accuracy, thus roughly halving the number of\nhyperparameters to be trained.\n","authors":["Silke Glas","Benjamin Unger"],"pdf_url":"https://arxiv.org/pdf/2501.03853v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16670v2","updated":"2025-01-07T15:00:20Z","published":"2024-09-25T06:57:42Z","title":"GraphLoRA: Structure-Aware Contrastive Low-Rank Adaptation for\n Cross-Graph Transfer Learning","summary":" Graph Neural Networks (GNNs) have demonstrated remarkable proficiency in\nhandling a range of graph analytical tasks across various domains, such as\ne-commerce and social networks. 
Despite their versatility, GNNs face\nsignificant challenges in transferability, limiting their utility in real-world\napplications. Existing research in GNN transfer learning overlooks\ndiscrepancies in distribution among various graph datasets, facing challenges\nwhen transferring across different distributions. How to effectively adapt a\nwell-trained GNN to new graphs with varying feature and structural\ndistributions remains an under-explored problem. Taking inspiration from the\nsuccess of Low-Rank Adaptation (LoRA) in adapting large language models to\nvarious domains, we propose GraphLoRA, an effective and parameter-efficient\nmethod for transferring well-trained GNNs to diverse graph domains.\nSpecifically, we first propose a Structure-aware Maximum Mean Discrepancy\n(SMMD) to align divergent node feature distributions across source and target\ngraphs. Moreover, we introduce low-rank adaptation by injecting a small\ntrainable GNN alongside the pre-trained one, effectively bridging structural\ndistribution gaps while mitigating catastrophic forgetting. Additionally, a\nstructure-aware regularization objective is proposed to enhance the\nadaptability of the pre-trained GNN to the target graph with scarce supervision\nlabels. Extensive experiments on eight real-world datasets demonstrate the\neffectiveness of GraphLoRA against fourteen baselines by tuning only 20% of\nparameters, even across disparate graph domains. 
The code is available at\nhttps://github.com/AllminerLab/GraphLoRA.\n","authors":["Zhe-Rui Yang","Jindong Han","Chang-Dong Wang","Hao Liu"],"pdf_url":"https://arxiv.org/pdf/2409.16670v2.pdf","comment":"Accepted by KDD2025"},{"id":"http://arxiv.org/abs/2501.03843v1","updated":"2025-01-07T14:53:35Z","published":"2025-01-07T14:53:35Z","title":"BERTopic for Topic Modeling of Hindi Short Texts: A Comparative Study","summary":" As short text data in native languages like Hindi increasingly appear in\nmodern media, robust methods for topic modeling on such data have gained\nimportance. This study investigates the performance of BERTopic in modeling\nHindi short texts, an area that has been under-explored in existing research.\nUsing contextual embeddings, BERTopic can capture semantic relationships in\ndata, making it potentially more effective than traditional models, especially\nfor short and diverse texts. We evaluate BERTopic using 6 different document\nembedding models and compare its performance against 8 established topic\nmodeling techniques, such as Latent Dirichlet Allocation (LDA), Non-negative\nMatrix Factorization (NMF), Latent Semantic Indexing (LSI), Additive\nRegularization of Topic Models (ARTM), Probabilistic Latent Semantic Analysis\n(PLSA), Embedded Topic Model (ETM), Combined Topic Model (CTM), and Top2Vec.\nThe models are assessed using coherence scores across a range of topic counts.\nOur results reveal that BERTopic consistently outperforms other models in\ncapturing coherent topics from short Hindi texts.\n","authors":["Atharva Mutsaddi","Anvi Jamkhande","Aryan Thakre","Yashodhara Haribhakta"],"pdf_url":"https://arxiv.org/pdf/2501.03843v1.pdf","comment":"Accepted into IndoNLP: The First Workshop on Natural Language\n Processing for Indo-Aryan and Dravidian Languages, collocated with COLING\n 2025. 
Set to appear in the workshop proceedings published in ACL Anthology"},{"id":"http://arxiv.org/abs/2403.18873v2","updated":"2025-01-07T14:52:34Z","published":"2024-03-26T14:42:46Z","title":"Predicting risk of cardiovascular disease using retinal OCT imaging","summary":" Cardiovascular diseases (CVD) are the leading cause of death globally.\nNon-invasive, cost-effective imaging techniques play a crucial role in early\ndetection and prevention of CVD. Optical coherence tomography (OCT) has gained\nrecognition as a potential tool for early CVD risk prediction, though its use\nremains underexplored. In this study, we investigated the potential of OCT as\nan additional imaging technique to predict future CVD events. We analysed\nretinal OCT data from the UK Biobank. The dataset included 612 patients who\nsuffered a myocardial infarction (MI) or stroke within five years of imaging\nand 2,234 controls without CVD (total: 2,846 participants). A self-supervised\ndeep learning approach based on Variational Autoencoders (VAE) was used to\nextract low-dimensional latent representations from high-dimensional 3D OCT\nimages, capturing distinct features of retinal layers. These latent features,\nalong with clinical data, were used to train a Random Forest (RF) classifier to\ndifferentiate between patients at risk of future CVD events (MI or stroke) and\nhealthy controls. Our model achieved an AUC of 0.75, sensitivity of 0.70,\nspecificity of 0.70, and accuracy of 0.70, outperforming the QRISK3 score (the\nthird version of the QRISK cardiovascular disease risk prediction algorithm;\nAUC = 0.60, sensitivity = 0.60, specificity = 0.55, accuracy = 0.55). The\nchoroidal layer in OCT images was identified as a key predictor of future CVD\nevents, revealed through a novel model explainability approach. 
This study\ndemonstrates that retinal OCT imaging is a cost-effective, non-invasive\nalternative for predicting CVD risk, offering potential for widespread\napplication in optometry practices and hospitals.\n","authors":["Cynthia Maldonado-Garcia","Rodrigo Bonazzola","Enzo Ferrante","Thomas H Julian","Panagiotis I Sergouniotis","Nishant Ravikumara","Alejandro F Frangi"],"pdf_url":"https://arxiv.org/pdf/2403.18873v2.pdf","comment":"New version - 26 pages for main manuscript, 7 figures, 7 pages for\n appendix and preprint for a journal"},{"id":"http://arxiv.org/abs/2501.03840v1","updated":"2025-01-07T14:50:05Z","published":"2025-01-07T14:50:05Z","title":"Machine learning applications in archaeological practices: a review","summary":" Artificial intelligence and machine learning applications in archaeology have\nincreased significantly in recent years, and these now span all subfields,\ngeographical regions, and time periods. The prevalence and success of these\napplications have remained largely unexamined, as recent reviews on the use of\nmachine learning in archaeology have focused only on specific subfields of\narchaeology. Our review examined an exhaustive corpus of 135 articles published\nbetween 1997 and 2022. We observed a significant increase in the number of\nrelevant publications from 2019 onwards. Automatic structure detection and\nartefact classification were the most represented tasks in the articles\nreviewed, followed by taphonomy, and archaeological predictive modelling. From\nthe review, clustering and unsupervised methods were underrepresented compared\nto supervised models. Artificial neural networks and ensemble learning account\nfor two thirds of the total number of models used. However, while machine learning\nis gaining in popularity, it remains subject to misunderstanding. We observed,\nin some cases, poorly defined requirements and caveats of the machine learning\nmethods used. 
Furthermore, the goals and the needs of machine learning\napplications for archaeological purposes are in some cases unclear or poorly\nexpressed. To address this, we proposed a workflow guide for archaeologists to\ndevelop coherent and consistent methodologies adapted to their research\nquestions, project scale and data. As in many other areas, machine learning is\nrapidly becoming an important tool in archaeological research and practice,\nuseful for the analyses of large and multivariate data, although not without\nlimitations. This review highlights the importance of well-defined and\nwell-reported structured methodologies and collaborative practices to maximise\nthe potential of applications of machine learning methods in archaeology.\n","authors":["Mathias Bellat","Jordy D. Orellana Figueroa","Jonathan S. Reeves","Ruhollah Taghizadeh-Mehrjardi","Claudio Tennie","Thomas Scholten"],"pdf_url":"https://arxiv.org/pdf/2501.03840v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.12968v2","updated":"2025-01-07T14:45:04Z","published":"2024-12-17T14:53:38Z","title":"On Local Overfitting and Forgetting in Deep Neural Networks","summary":" The infrequent occurrence of overfitting in deep neural networks is\nperplexing: contrary to theoretical expectations, increasing model size often\nenhances performance in practice. But what if overfitting does occur, though\nrestricted to specific sub-regions of the data space? In this work, we propose\na novel score that captures the forgetting rate of deep models on validation\ndata. We posit that this score quantifies local overfitting: a decline in\nperformance confined to certain regions of the data space. We then show\nempirically that local overfitting occurs regardless of the presence of\ntraditional overfitting. 
Using the framework of deep over-parametrized linear\nmodels, we offer a certain theoretical characterization of forgotten knowledge,\nand show that it correlates with knowledge forgotten by real deep models.\nFinally, we devise a new ensemble method that aims to recover forgotten\nknowledge, relying solely on the training history of a single network. When\ncombined with self-distillation, this method enhances the performance of any\ntrained model without adding inference costs. Extensive empirical evaluations\ndemonstrate the efficacy of our method across multiple datasets, contemporary\nneural network architectures, and training protocols.\n","authors":["Uri Stern","Tomer Yaacoby","Daphna Weinshall"],"pdf_url":"https://arxiv.org/pdf/2412.12968v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2310.11094"},{"id":"http://arxiv.org/abs/2501.03832v1","updated":"2025-01-07T14:42:38Z","published":"2025-01-07T14:42:38Z","title":"Three-dimensional attention Transformer for state evaluation in\n real-time strategy games","summary":" Situation assessment in Real-Time Strategy (RTS) games is crucial for\nunderstanding decision-making in complex adversarial environments. However,\nexisting methods remain limited in processing multi-dimensional feature\ninformation and temporal dependencies. 
Here we propose a tri-dimensional\nSpace-Time-Feature Transformer (TSTF Transformer) architecture, which\nefficiently models battlefield situations through three independent but\ncascaded modules: spatial attention, temporal attention, and feature attention.\nOn a dataset comprising 3,150 adversarial experiments, the 8-layer TSTF\nTransformer demonstrates superior performance: achieving 58.7% accuracy in the\nearly game (~4% progress), significantly outperforming the conventional\nTimesformer's 41.8%; reaching 97.6% accuracy in the mid-game (~40% progress)\nwhile maintaining low performance variation (standard deviation 0.114).\nMeanwhile, this architecture requires fewer parameters (4.75M) compared to the\nbaseline model (5.54M). Our study not only provides new insights into situation\nassessment in RTS games but also presents an innovative paradigm for\nTransformer-based multi-dimensional temporal modeling.\n","authors":["Yanqing Ye","Weilong Yang","Kai Qiu","Jie Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.03832v1.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.12875v2","updated":"2025-01-07T14:39:26Z","published":"2024-08-23T07:14:56Z","title":"Disentangling, Amplifying, and Debiasing: Learning Disentangled\n Representations for Fair Graph Neural Networks","summary":" Graph Neural Networks (GNNs) have become essential tools for graph\nrepresentation learning in various domains, such as social media and\nhealthcare. However, they often suffer from fairness issues due to inherent\nbiases in node attributes and graph structure, leading to unfair predictions.\nTo address these challenges, we propose a novel GNN framework, DAB-GNN, that\nDisentangles, Amplifies, and deBiases attribute, structure, and potential\nbiases in the GNN mechanism. 
DAB-GNN employs a disentanglement and\namplification module that isolates and amplifies each type of bias through\nspecialized disentanglers, followed by a debiasing module that minimizes the\ndistance between subgroup distributions. Extensive experiments on five datasets\ndemonstrate that DAB-GNN significantly outperforms ten state-of-the-art\ncompetitors in terms of achieving an optimal balance between accuracy and\nfairness. The codebase of DAB-GNN is available at\nhttps://github.com/Bigdasgit/DAB-GNN\n","authors":["Yeon-Chang Lee","Hojung Shin","Sang-Wook Kim"],"pdf_url":"https://arxiv.org/pdf/2408.12875v2.pdf","comment":"Accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2501.03826v1","updated":"2025-01-07T14:38:49Z","published":"2025-01-07T14:38:49Z","title":"Investigating the Impact of Data Selection Strategies on Language Model\n Performance","summary":" Data selection is critical for enhancing the performance of language models,\nparticularly when aligning training datasets with a desired target\ndistribution. This study explores the effects of different data selection\nmethods and feature types on model performance. We evaluate whether selecting\ndata subsets can influence downstream tasks, whether n-gram features improve\nalignment with target distributions, and whether embedding-based neural\nfeatures provide complementary benefits. Through comparative experiments using\nbaseline random selection methods and distribution aligned approaches, we\nprovide insights into the interplay between data selection strategies and model\ntraining efficacy. 
All code for this study can be found on\n\\href{https://github.com/jgu13/HIR-Hybrid-Importance-Resampling-for-Language-Models}{github\nrepository}.\n","authors":["Jiayao Gu","Liting Chen","Yihong Li"],"pdf_url":"https://arxiv.org/pdf/2501.03826v1.pdf","comment":"7 pages, 1 figure"},{"id":"http://arxiv.org/abs/2501.03821v1","updated":"2025-01-07T14:35:09Z","published":"2025-01-07T14:35:09Z","title":"Class-Balance Bias in Regularized Regression","summary":" Regularized models are often sensitive to the scales of the features in the\ndata and it has therefore become standard practice to normalize (center and\nscale) the features before fitting the model. But there are many different ways\nto normalize the features and the choice may have dramatic effects on the\nresulting model. In spite of this, there has so far been no research on this\ntopic. In this paper, we begin to bridge this knowledge gap by studying\nnormalization in the context of lasso, ridge, and elastic net regression. We\nfocus on normal and binary features and show that the class balances of binary\nfeatures directly influences the regression coefficients and that this effect\ndepends on the combination of normalization and regularization methods used. We\ndemonstrate that this effect can be mitigated by scaling binary features with\ntheir variance in the case of the lasso and standard deviation in the case of\nridge regression, but that this comes at the cost of increased variance. For\nthe elastic net, we show that scaling the penalty weights, rather than the\nfeatures, can achieve the same effect. 
Finally, we also tackle mixes of binary\nand normal features as well as interactions and provide some initial results on\nhow to normalize features in these cases.\n","authors":["Johan Larsson","Jonas Wallin"],"pdf_url":"https://arxiv.org/pdf/2501.03821v1.pdf","comment":"27 pages, 21 figures"},{"id":"http://arxiv.org/abs/2412.19950v2","updated":"2025-01-07T14:35:01Z","published":"2024-12-27T23:10:32Z","title":"Data-driven tool wear prediction in milling, based on a\n process-integrated single-sensor approach","summary":" Accurate tool wear prediction is essential for maintaining productivity and\nminimizing costs in machining. However, the complex nature of the tool wear\nprocess poses significant challenges to achieving reliable predictions. This\nstudy explores data-driven methods, in particular deep learning, for tool wear\nprediction. Traditional data-driven approaches often focus on a single process,\nrelying on multi-sensor setups and extensive data generation, which limits\ngeneralization to new settings. Moreover, multi-sensor integration is often\nimpractical in industrial environments. To address these limitations, this\nresearch investigates the transferability of predictive models using minimal\ntraining data, validated across two processes. Furthermore, it uses a simple\nsetup with a single acceleration sensor to establish a low-cost data generation\napproach that facilitates the generalization of models to other processes via\ntransfer learning. The study evaluates several machine learning models,\nincluding convolutional neural networks (CNN), long short-term memory networks\n(LSTM), support vector machines (SVM) and decision trees, trained on different\ninput formats such as feature vectors and short-time Fourier transform (STFT).\nThe performance of the models is evaluated on different amounts of training\ndata, including scenarios with significantly reduced datasets, providing\ninsight into their effectiveness under constrained data conditions. 
The results\ndemonstrate the potential of specific models and configurations for effective\ntool wear prediction, contributing to the development of more adaptable and\nefficient predictive maintenance strategies in machining. Notably, the ConvNeXt\nmodel has exceptional performance, achieving 99.1% accuracy in\nidentifying tool wear using data from only four milling tools operated until\nthey are worn.\n","authors":["Eric Hirsch","Christian Friedrich"],"pdf_url":"https://arxiv.org/pdf/2412.19950v2.pdf","comment":"Preprint submitted to Robotics and Computer-Integrated Manufacturing,\n 14 pages, 9 figures"},{"id":"http://arxiv.org/abs/2501.01707v2","updated":"2025-01-07T14:28:54Z","published":"2025-01-03T09:09:58Z","title":"Catch Causal Signals from Edges for Label Imbalance in Graph\n Classification","summary":" Despite significant advancements in causal research on graphs and its\napplication to cracking label imbalance, the role of edge features in detecting\nthe causal effects within graphs has been largely overlooked, leaving existing\nmethods with untapped potential for further performance gains. In this paper,\nwe enhance the causal attention mechanism through effectively leveraging edge\ninformation to disentangle the causal subgraph from the original graph, as well\nas further utilizing edge features to reshape graph representations. Capturing\nmore comprehensive causal signals, our design leads to improved performance on\ngraph classification tasks with label imbalance issues. We evaluate our\napproach on real-world datasets PTC, Tox21, and ogbg-molhiv, observing\nimprovements over baselines. Overall, we highlight the importance of edge\nfeatures in graph causal detection and provide a promising direction for\naddressing label imbalance challenges in graph-level tasks. 
The model\nimplementation details and the codes are available on\nhttps://github.com/fengrui-z/ECAL\n","authors":["Fengrui Zhang","Yujia Yin","Hongzong Li","Yifan Chen","Tianyi Qu"],"pdf_url":"https://arxiv.org/pdf/2501.01707v2.pdf","comment":"ICASSP 2025"},{"id":"http://arxiv.org/abs/2405.15840v2","updated":"2025-01-07T14:24:59Z","published":"2024-05-24T16:03:47Z","title":"Learning the Language of Protein Structure","summary":" Representation learning and \\emph{de novo} generation of proteins are pivotal\ncomputational biology tasks. Whilst natural language processing (NLP)\ntechniques have proven highly effective for protein sequence modelling,\nstructure modelling presents a complex challenge, primarily due to its\ncontinuous and three-dimensional nature. Motivated by this discrepancy, we\nintroduce an approach using a vector-quantized autoencoder that effectively\ntokenizes protein structures into discrete representations. This method\ntransforms the continuous, complex space of protein structures into a\nmanageable, discrete format with a codebook ranging from 4096 to 64000 tokens,\nachieving high-fidelity reconstructions with backbone root mean square\ndeviations (RMSD) of approximately 1-5 \\AA. To demonstrate the efficacy of our\nlearned representations, we show that a simple GPT model trained on our\ncodebooks can generate novel, diverse, and designable protein structures. Our\napproach not only provides representations of protein structure, but also\nmitigates the challenges of disparate modal representations and sets a\nfoundation for seamless, multi-modal integration, enhancing the capabilities of\ncomputational methods in protein design.\n","authors":["Benoit Gaujac","Jérémie Donà","Liviu Copoiu","Timothy Atkinson","Thomas Pierrot","Thomas D. 
Barrett"],"pdf_url":"https://arxiv.org/pdf/2405.15840v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.01460v2","updated":"2025-01-07T14:19:35Z","published":"2024-12-31T10:43:19Z","title":"GDSR: Global-Detail Integration through Dual-Branch Network with Wavelet\n Losses for Remote Sensing Image Super-Resolution","summary":" In recent years, deep neural networks, including Convolutional Neural\nNetworks, Transformers, and State Space Models, have achieved significant\nprogress in Remote Sensing Image (RSI) Super-Resolution (SR). However, existing\nSR methods typically overlook the complementary relationship between global and\nlocal dependencies. These methods either focus on capturing local information\nor prioritize global information, which results in models that are unable to\neffectively capture both global and local features simultaneously. Moreover,\ntheir computational cost becomes prohibitive when applied to large-scale RSIs.\nTo address these challenges, we introduce the novel application of Receptance\nWeighted Key Value (RWKV) to RSI-SR, which captures long-range dependencies\nwith linear complexity. To simultaneously model global and local features, we\npropose the Global-Detail dual-branch structure, GDSR, which performs SR\nreconstruction by paralleling RWKV and convolutional operations to handle\nlarge-scale RSIs. Furthermore, we introduce the Global-Detail Reconstruction\nModule (GDRM) as an intermediary between the two branches to bridge their\ncomplementary roles. In addition, we propose Wavelet Loss, a loss function that\neffectively captures high-frequency detail information in images, thereby\nenhancing the visual quality of SR, particularly in terms of detail\nreconstruction. 
Extensive experiments on several benchmarks, including AID,\nAID_CDM, RSSRD-QH, and RSSRD-QH_CDM, demonstrate that GDSR outperforms the\nstate-of-the-art Transformer-based method HAT by an average of 0.05 dB in PSNR,\nwhile using only 63% of its parameters and 51% of its FLOPs, achieving an\ninference speed 2.9 times faster. Furthermore, the Wavelet Loss shows excellent\ngeneralization across various architectures, providing a novel perspective for\nRSI-SR enhancement.\n","authors":["Qiwei Zhu","Kai Li","Guojing Zhang","Xiaoying Wang","Jianqiang Huang","Xilai Li"],"pdf_url":"https://arxiv.org/pdf/2501.01460v2.pdf","comment":"The experiments were conducted using private datasets that were\n incomplete as they did not include all the necessary copyrights.\n Additionally, the conclusions require further exploration as the work is\n still in progress"},{"id":"http://arxiv.org/abs/2501.03782v1","updated":"2025-01-07T13:45:09Z","published":"2025-01-07T13:45:09Z","title":"Vision Transformer Neural Architecture Search for Out-of-Distribution\n Generalization: Benchmark and Insights","summary":" While ViTs have achieved success across machine learning tasks, deploying them\nin real-world scenarios faces a critical challenge: generalizing under OoD shifts.\nA crucial research gap exists in understanding how to design ViT architectures,\nboth manually and automatically, for better OoD generalization. To this end, we\nintroduce OoD-ViT-NAS, the first systematic benchmark for ViTs NAS focused on\nOoD generalization. This benchmark includes 3000 ViT architectures of varying\ncomputational budgets evaluated on 8 common OoD datasets. Using this benchmark,\nwe analyze factors contributing to OoD generalization. Our findings reveal key\ninsights. First, ViT architecture designs significantly affect OoD\ngeneralization. Second, ID accuracy is often a poor indicator of OoD accuracy,\nhighlighting the risk of optimizing ViT architectures solely for ID\nperformance. 
Third, we perform the first study of NAS for ViTs OoD robustness,\nanalyzing 9 Training-free NAS methods. We find that existing Training-free NAS\nmethods are largely ineffective in predicting OoD accuracy despite excelling at\nID accuracy. Simple proxies like Param or Flop surprisingly outperform complex\nTraining-free NAS methods in predicting OoD accuracy. Finally, we study how ViT\narchitectural attributes impact OoD generalization and discover that increasing\nembedding dimensions generally enhances performance. Our benchmark shows that\nViT architectures exhibit a wide range of OoD accuracy, with up to 11.85%\nimprovement for some OoD shifts. This underscores the importance of studying\nViT architecture design for OoD. We believe OoD-ViT-NAS can catalyze further\nresearch into how ViT designs influence OoD generalization.\n","authors":["Sy-Tuyen Ho","Tuan Van Vo","Somayeh Ebrahimkhani","Ngai-Man Cheung"],"pdf_url":"https://arxiv.org/pdf/2501.03782v1.pdf","comment":"Accepted in NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.04280v3","updated":"2025-01-07T13:43:36Z","published":"2024-06-06T17:26:40Z","title":"xMIL: Insightful Explanations for Multiple Instance Learning in\n Histopathology","summary":" Multiple instance learning (MIL) is an effective and widely used approach for\nweakly supervised machine learning. In histopathology, MIL models have achieved\nremarkable success in tasks like tumor detection, biomarker prediction, and\noutcome prognostication. However, MIL explanation methods are still lagging\nbehind, as they are limited to small bag sizes or disregard instance\ninteractions. We revisit MIL through the lens of explainable AI (XAI) and\nintroduce xMIL, a refined framework with more general assumptions. We\ndemonstrate how to obtain improved MIL explanations using layer-wise relevance\npropagation (LRP) and conduct extensive evaluation experiments on three toy\nsettings and four real-world histopathology datasets. 
Our approach consistently\noutperforms previous explanation attempts with particularly improved\nfaithfulness scores on challenging biomarker prediction tasks. Finally, we\nshowcase how xMIL explanations enable pathologists to extract insights from MIL\nmodels, representing a significant advance for knowledge discovery and model\ndebugging in digital histopathology. Codes are available at:\nhttps://github.com/bifold-pathomics/xMIL.\n","authors":["Julius Hense","Mina Jamshidi Idaji","Oliver Eberle","Thomas Schnake","Jonas Dippel","Laure Ciernik","Oliver Buchstab","Andreas Mock","Frederick Klauschen","Klaus-Robert Müller"],"pdf_url":"https://arxiv.org/pdf/2406.04280v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.00546v3","updated":"2025-01-07T13:31:01Z","published":"2023-12-31T17:21:02Z","title":"AllSpark: A Multimodal Spatio-Temporal General Intelligence Model with\n Ten Modalities via Language as a Reference Framework","summary":" Leveraging multimodal data is an inherent requirement for comprehending\ngeographic objects. However, due to the high heterogeneity in structure and\nsemantics among various spatio-temporal modalities, the joint interpretation of\nmultimodal spatio-temporal data has long been an extremely challenging problem.\nThe primary challenge resides in striking a trade-off between the cohesion and\nautonomy of diverse modalities. This trade-off becomes progressively nonlinear\nas the number of modalities expands. Inspired by the human cognitive system and\nlinguistic philosophy, where perceptual signals from the five senses converge\ninto language, we introduce the Language as Reference Framework (LaRF), a\nfundamental principle for constructing a multimodal unified model. Building\nupon this, we propose AllSpark, a multimodal spatio-temporal general artificial\nintelligence model. Our model integrates ten different modalities into a\nunified framework. 
To achieve modal cohesion, AllSpark introduces a modal\nbridge and multimodal large language model (LLM) to map diverse modal features\ninto the language feature space. To maintain modality autonomy, AllSpark uses\nmodality-specific encoders to extract the tokens of various spatio-temporal\nmodalities. Finally, observing a gap between the model's interpretability and\ndownstream tasks, we designed modality-specific prompts and task heads,\nenhancing the model's generalization capability across specific tasks.\nExperiments indicate that the incorporation of language enables AllSpark to\nexcel in few-shot classification tasks for RGB and point cloud modalities\nwithout additional training, surpassing baseline performance by up to 41.82\\%.\nThe source code is available at https://github.com/GeoX-Lab/AllSpark.\n","authors":["Run Shao","Cheng Yang","Qiujun Li","Qing Zhu","Yongjun Zhang","YanSheng Li","Yu Liu","Yong Tang","Dapeng Liu","Shizhong Yang","Haifeng Li"],"pdf_url":"https://arxiv.org/pdf/2401.00546v3.pdf","comment":"19 pages, 19 tables, 3 figures"},{"id":"http://arxiv.org/abs/2407.15857v2","updated":"2025-01-07T13:28:00Z","published":"2024-07-08T06:38:50Z","title":"BoRA: Bayesian Hierarchical Low-Rank Adaption for Multi-Task Large\n Language Models","summary":" This paper introduces Bayesian Hierarchical Low-Rank Adaption (BoRA), a novel\nmethod for finetuning multi-task Large Language Models (LLMs). Current\nfinetuning approaches, such as Low-Rank Adaption (LoRA), perform exceptionally\nwell in reducing training parameters and memory usage but face limitations when\napplied to multiple similar tasks. Practitioners usually have to choose between\ntraining separate models for each task or a single model for all tasks, both of\nwhich come with trade-offs in specialization and data utilization. BoRA\naddresses these trade-offs by leveraging a Bayesian hierarchical model that\nallows tasks to share information through global hierarchical priors. 
This\nenables tasks with limited data to benefit from the overall structure derived\nfrom related tasks while allowing tasks with more data to specialize. Our\nexperimental results show that BoRA outperforms both individual and unified\nmodel approaches, achieving lower perplexity and better generalization across\ntasks. This method provides a scalable and efficient solution for multi-task\nLLM finetuning, with significant practical implications for diverse\napplications.\n","authors":["Simen Eide","Arnoldo Frigessi"],"pdf_url":"https://arxiv.org/pdf/2407.15857v2.pdf","comment":"14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2501.03769v1","updated":"2025-01-07T13:22:35Z","published":"2025-01-07T13:22:35Z","title":"Multi-label Cross-lingual automatic music genre classification from\n lyrics with Sentence BERT","summary":" Music genres are shaped by both the stylistic features of songs and the\ncultural preferences of artists' audiences. Automatic classification of music\ngenres using lyrics can be useful in several applications such as\nrecommendation systems, playlist creation, and library organization. We present\na multi-label, cross-lingual genre classification system based on multilingual\nsentence embeddings generated by sBERT. Using a bilingual Portuguese-English\ndataset with eight overlapping genres, we demonstrate the system's ability to\ntrain on lyrics in one language and predict genres in another. Our approach\noutperforms the baseline approach of translating lyrics and using a\nbag-of-words representation, improving the genrewise average F1-Score from 0.35\nto 0.69. The classifier uses a one-vs-all architecture, enabling it to assign\nmultiple genre labels to a single lyric. Experimental results reveal that\ndataset centralization notably improves cross-lingual performance. 
This\napproach offers a scalable solution for genre classification across\nunderrepresented languages and cultural domains, advancing the capabilities of\nmusic information retrieval systems.\n","authors":["Tiago Fernandes Tavares","Fabio José Ayres"],"pdf_url":"https://arxiv.org/pdf/2501.03769v1.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2310.20708v3","updated":"2025-01-07T13:11:19Z","published":"2023-10-31T17:59:56Z","title":"Unexpected Improvements to Expected Improvement for Bayesian\n Optimization","summary":" Expected Improvement (EI) is arguably the most popular acquisition function\nin Bayesian optimization and has found countless successful applications, but\nits performance is often exceeded by that of more recent methods. Notably, EI\nand its variants, including for the parallel and multi-objective settings, are\nchallenging to optimize because their acquisition values vanish numerically in\nmany regions. This difficulty generally increases as the number of\nobservations, dimensionality of the search space, or the number of constraints\ngrow, resulting in performance that is inconsistent across the literature and\nmost often sub-optimal. Herein, we propose LogEI, a new family of acquisition\nfunctions whose members either have identical or approximately equal optima as\ntheir canonical counterparts, but are substantially easier to optimize\nnumerically. We demonstrate that numerical pathologies manifest themselves in\n\"classic\" analytic EI, Expected Hypervolume Improvement (EHVI), as well as\ntheir constrained, noisy, and parallel variants, and propose corresponding\nreformulations that remedy these pathologies. 
Our empirical results show that\nmembers of the LogEI family of acquisition functions substantially improve on\nthe optimization performance of their canonical counterparts and surprisingly,\nare on par with or exceed the performance of recent state-of-the-art\nacquisition functions, highlighting the understated role of numerical\noptimization in the literature.\n","authors":["Sebastian Ament","Samuel Daulton","David Eriksson","Maximilian Balandat","Eytan Bakshy"],"pdf_url":"https://arxiv.org/pdf/2310.20708v3.pdf","comment":"NeurIPS 2023 Spotlight (https://openreview.net/forum?id=QFgYOtOkDB)"},{"id":"http://arxiv.org/abs/2410.24222v2","updated":"2025-01-07T13:04:51Z","published":"2024-10-31T17:59:56Z","title":"Robust Gaussian Processes via Relevance Pursuit","summary":" Gaussian processes (GPs) are non-parametric probabilistic regression models\nthat are popular due to their flexibility, data efficiency, and well-calibrated\nuncertainty estimates. However, standard GP models assume homoskedastic\nGaussian noise, while many real-world applications are subject to non-Gaussian\ncorruptions. Variants of GPs that are more robust to alternative noise models\nhave been proposed, and entail significant trade-offs between accuracy and\nrobustness, and between computational requirements and theoretical guarantees.\nIn this work, we propose and study a GP model that achieves robustness against\nsparse outliers by inferring data-point-specific noise levels with a sequential\nselection procedure maximizing the log marginal likelihood that we refer to as\nrelevance pursuit. We show, surprisingly, that the model can be parameterized\nsuch that the associated log marginal likelihood is strongly concave in the\ndata-point-specific noise variances, a property rarely found in either robust\nregression objectives or GP marginal likelihoods. 
This in turn implies the weak\nsubmodularity of the corresponding subset selection problem, and thereby proves\napproximation guarantees for the proposed algorithm. We compare the model's\nperformance relative to other approaches on diverse regression and Bayesian\noptimization tasks, including the challenging but common setting of sparse\ncorruptions of the labels within or close to the function range.\n","authors":["Sebastian Ament","Elizabeth Santorella","David Eriksson","Ben Letham","Maximilian Balandat","Eytan Bakshy"],"pdf_url":"https://arxiv.org/pdf/2410.24222v2.pdf","comment":"NeurIPS 2024 Article (https://openreview.net/forum?id=5FATPIlWUJ)"},{"id":"http://arxiv.org/abs/2409.09138v2","updated":"2025-01-07T13:04:13Z","published":"2024-09-13T18:42:11Z","title":"Fast Structured Orthogonal Dictionary Learning using Householder\n Reflections","summary":" In this paper, we propose and investigate algorithms for the structured\northogonal dictionary learning problem. First, we investigate the case when the\ndictionary is a Householder matrix. We give sample complexity results and show\ntheoretically guaranteed approximate recovery (in the $l_{\\infty}$ sense) with\noptimal computational complexity. We then attempt to generalize these\ntechniques when the dictionary is a product of a few Householder matrices. 
We\nnumerically validate these techniques in the sample-limited setting to show\nperformance similar to or better than existing techniques while having much\nimproved computational complexity.\n","authors":["Anirudh Dash","Aditya Siripuram"],"pdf_url":"https://arxiv.org/pdf/2409.09138v2.pdf","comment":"12 pages, 5 figures, accepted for publication: IEEE ICASSP, 2025"},{"id":"http://arxiv.org/abs/2409.18301v3","updated":"2025-01-07T12:44:48Z","published":"2024-09-26T21:16:51Z","title":"Wavelet-Driven Generalizable Framework for Deepfake Face Forgery\n Detection","summary":" The evolution of digital image manipulation, particularly with the\nadvancement of deep generative models, significantly challenges existing\ndeepfake detection methods, especially when the origin of the deepfake is\nobscure. To tackle the increasing complexity of these forgeries, we propose\n\\textbf{Wavelet-CLIP}, a deepfake detection framework that integrates wavelet\ntransforms with features derived from the ViT-L/14 architecture, pre-trained in\nthe CLIP fashion. Wavelet-CLIP utilizes Wavelet Transforms to deeply analyze\nboth spatial and frequency features from images, thus enhancing the model's\ncapability to detect sophisticated deepfakes. To verify the effectiveness of\nour approach, we conducted extensive evaluations against existing\nstate-of-the-art methods for cross-dataset generalization and detection of\nunseen images generated by standard diffusion models. Our method showcases\noutstanding performance, achieving an average AUC of 0.749 for cross-data\ngeneralization and 0.893 for robustness against unseen deepfakes, outperforming\nall compared methods. 
The code can be reproduced from the repo:\n\\url{https://github.com/lalithbharadwajbaru/Wavelet-CLIP}\n","authors":["Lalith Bharadwaj Baru","Rohit Boddeda","Shilhora Akshay Patel","Sai Mohan Gajapaka"],"pdf_url":"https://arxiv.org/pdf/2409.18301v3.pdf","comment":"9 Pages, 2 Figures, 3 Tables"},{"id":"http://arxiv.org/abs/2401.04482v3","updated":"2025-01-07T12:40:58Z","published":"2024-01-09T10:39:17Z","title":"Continuously Learning New Words in Automatic Speech Recognition","summary":" Despite recent advances, Automatic Speech Recognition (ASR) systems are still\nfar from perfect. Typical errors include acronyms, named entities, and\ndomain-specific special words for which little or no labeled data is available.\nTo address the problem of recognizing these words, we propose a self-supervised\ncontinual learning approach: Given the audio of a lecture talk with the\ncorresponding slides, we bias the model towards decoding new words from the\nslides by using a memory-enhanced ASR model from the literature. Then, we\nperform inference on the talk, collecting utterances that contain detected new\nwords into an adaptation data set. Continual learning is then performed by\ntraining adaptation weights added to the model on this data set. The whole\nprocedure is iterated for many talks. 
We show that with this approach, we\nobtain increasing performance on the new words when they occur more frequently\n(more than 80% recall) while preserving the general performance of the model.\n","authors":["Christian Huber","Alexander Waibel"],"pdf_url":"https://arxiv.org/pdf/2401.04482v3.pdf","comment":"Accepted at ICASSP 2025"},{"id":"http://arxiv.org/abs/2501.03747v1","updated":"2025-01-07T12:40:35Z","published":"2025-01-07T12:40:35Z","title":"Context-Alignment: Activating and Enhancing LLM Capabilities in Time\n Series","summary":" Recently, leveraging pre-trained Large Language Models (LLMs) for time series\n(TS) tasks has gained increasing attention, which involves activating and\nenhancing LLMs' capabilities. Many methods aim to activate LLMs' capabilities\nbased on token-level alignment but overlook LLMs' inherent strength on natural\nlanguage processing -- their deep understanding of linguistic logic and\nstructure rather than superficial embedding processing. We propose\nContext-Alignment, a new paradigm that aligns TS with a linguistic component in\nthe language environments familiar to LLMs to enable LLMs to contextualize and\ncomprehend TS data, thereby activating their capabilities. Specifically, such\ncontext-level alignment comprises structural alignment and logical alignment,\nwhich is achieved by a Dual-Scale Context-Alignment GNNs (DSCA-GNNs) applied to\nTS-language multimodal inputs. Structural alignment utilizes dual-scale nodes\nto describe hierarchical structure in TS-language, enabling LLMs to treat long TS\ndata as a whole linguistic component while preserving intrinsic token features.\nLogical alignment uses directed edges to guide logical relationships, ensuring\ncoherence in the contextual semantics. Demonstration example prompts are\nemployed to construct Demonstration Examples based Context-Alignment (DECA)\nfollowing the DSCA-GNNs framework. 
DECA can be flexibly and repeatedly integrated\ninto various layers of pre-trained LLMs to improve awareness of logic and\nstructure, thereby enhancing performance. Extensive experiments show the\neffectiveness of DECA and the importance of Context-Alignment across tasks,\nparticularly in few-shot and zero-shot forecasting, confirming that\nContext-Alignment provides powerful prior knowledge on context.\n","authors":["Yuxiao Hu","Qian Li","Dongxiao Zhang","Jinyue Yan","Yuntian Chen"],"pdf_url":"https://arxiv.org/pdf/2501.03747v1.pdf","comment":"no comment"},{"id":"http://arxiv.org/abs/2501.03746v1","updated":"2025-01-07T12:40:11Z","published":"2025-01-07T12:40:11Z","title":"A Multimodal Lightweight Approach to Fault Diagnosis of Induction Motors\n in High-Dimensional Dataset","summary":" An accurate AI-based diagnostic system for induction motors (IMs) holds the\npotential to enhance proactive maintenance, mitigating unplanned downtime and\ncurbing overall maintenance costs within an industrial environment. Notably,\namong the prevalent faults in IMs, a Broken Rotor Bar (BRB) fault is frequently\nencountered. Researchers have proposed various fault diagnosis approaches using\nsignal processing (SP), machine learning (ML), deep learning (DL), and hybrid\narchitectures for BRB faults. One limitation in the existing literature is the\ntraining of these architectures on relatively small datasets, risking\noverfitting when implementing such systems in industrial environments. This\npaper addresses this limitation by implementing large-scale data of BRB faults\nby using a transfer-learning-based lightweight DL model named ShuffleNetV2 for\ndiagnosing one, two, three, and four BRB faults using current and vibration\nsignal data. Spectral images for training and testing are generated using a\nShort-Time Fourier Transform (STFT). The dataset comprises 57,500 images, with\n47,500 used for training and 10,000 for testing. 
Remarkably, the ShuffleNetV2\nmodel exhibited superior performance, with lower computational cost, while\naccurately classifying 98.856% of spectral images. To further enhance the\nvisualization of harmonic sidebands resulting from broken bars, Fast Fourier\nTransform (FFT) is applied to current and vibration data. The paper also\nprovides insights into the training and testing times for each model,\ncontributing to a comprehensive understanding of the proposed fault diagnosis\nmethodology. The findings of our research provide valuable insights into the\nperformance and efficiency of different ML and DL models, offering a foundation\nfor the development of robust fault diagnosis systems for induction motors in\nindustrial settings.\n","authors":["Usman Ali"],"pdf_url":"https://arxiv.org/pdf/2501.03746v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03124v2","updated":"2025-01-07T12:33:44Z","published":"2025-01-06T16:31:45Z","title":"PRMBench: A Fine-grained and Challenging Benchmark for Process-Level\n Reward Models","summary":" Process-level Reward Models (PRMs) are crucial for complex reasoning and\ndecision-making tasks, where each intermediate step plays an important role in\nthe reasoning process. Since language models are prone to various types of\nerrors during the reasoning process, PRMs are required to possess nuanced\ncapabilities for detecting various implicit error types in real-world\nscenarios. However, current benchmarks primarily focus on step correctness,\nfailing to evaluate PRMs' performance systematically. To address this gap, we\nintroduce PRMBench, a process-level benchmark specifically designed to assess\nthe fine-grained error detection capabilities of PRMs. 
PRMBench comprises 6,216\ncarefully designed problems and 83,456 step-level labels, evaluating models\nacross multiple dimensions, including simplicity, soundness, and sensitivity.\nIn our experiments on 15 models, spanning both open-source PRMs and\nclosed-source large language models prompted as critic models, we uncover\nsignificant weaknesses in current PRMs. These findings underscore the\nchallenges inherent in process-level evaluation and highlight key directions\nfor future research. We hope PRMBench can be a robust bench for advancing\nresearch on PRM evaluation and development.\n","authors":["Mingyang Song","Zhaochen Su","Xiaoye Qu","Jiawei Zhou","Yu Cheng"],"pdf_url":"https://arxiv.org/pdf/2501.03124v2.pdf","comment":"Project Page: https://prmbench.github.io/"},{"id":"http://arxiv.org/abs/2405.16449v3","updated":"2025-01-07T12:16:43Z","published":"2024-05-26T06:33:11Z","title":"Reinforcement Learning for Jump-Diffusions, with Financial Applications","summary":" We study continuous-time reinforcement learning (RL) for stochastic control\nin which system dynamics are governed by jump-diffusion processes. We formulate\nan entropy-regularized exploratory control problem with stochastic policies to\ncapture the exploration--exploitation balance essential for RL. Unlike the pure\ndiffusion case initially studied by Wang et al. (2020), the derivation of the\nexploratory dynamics under jump-diffusions calls for a careful formulation of\nthe jump part. Through a theoretical analysis, we find that one can simply use\nthe same policy evaluation and $q$-learning algorithms in Jia and Zhou (2022a,\n2023), originally developed for controlled diffusions, without needing to check\na priori whether the underlying data come from a pure diffusion or a\njump-diffusion. However, we show that the presence of jumps ought to affect\nparameterizations of actors and critics in general. 
We investigate as an\napplication the mean--variance portfolio selection problem with stock price\nmodelled as a jump-diffusion, and show that both RL algorithms and\nparameterizations are invariant with respect to jumps. Finally, we present a\ndetailed study on applying the general theory to option hedging.\n","authors":["Xuefeng Gao","Lingfei Li","Xun Yu Zhou"],"pdf_url":"https://arxiv.org/pdf/2405.16449v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03727v1","updated":"2025-01-07T12:16:26Z","published":"2025-01-07T12:16:26Z","title":"Detecting Neurocognitive Disorders through Analyses of Topic Evolution\n and Cross-modal Consistency in Visual-Stimulated Narratives","summary":" Early detection of neurocognitive disorders (NCDs) is crucial for timely\nintervention and disease management. Speech analysis offers a non-intrusive and\nscalable screening method, particularly through narrative tasks in\nneuropsychological assessment tools. Traditional narrative analysis often\nfocuses on local indicators in microstructure, such as word usage and syntax.\nWhile these features provide insights into language production abilities, they\noften fail to capture global narrative patterns, or macrostructures.\nMacrostructures include coherence, thematic organization, and logical\nprogressions, reflecting essential cognitive skills potentially critical for\nrecognizing NCDs. Addressing this gap, we propose to investigate specific\ncognitive and linguistic challenges by analyzing topical shifts, temporal\ndynamics, and the coherence of narratives over time, aiming to reveal cognitive\ndeficits by identifying narrative impairments, and exploring their impact on\ncommunication and cognition. The investigation is based on the CU-MARVEL Rabbit\nStory corpus, which comprises recordings of a story-telling task from 758 older\nadults. 
We developed two approaches: the Dynamic Topic Models (DTM)-based\ntemporal analysis to examine the evolution of topics over time, and the\nText-Image Temporal Alignment Network (TITAN) to evaluate the coherence between\nspoken narratives and visual stimuli. The DTM-based approach validated the\neffectiveness of dynamic topic consistency as a macrostructural metric\n(F1=0.61, AUC=0.78). The TITAN approach achieved the highest performance\n(F1=0.72, AUC=0.81), surpassing established microstructural and macrostructural\nfeature sets. Cross-comparison and regression tasks further demonstrated the\neffectiveness of the proposed dynamic macrostructural modeling approaches for\nNCD detection.\n","authors":["Jinchao Li","Yuejiao Wang","Junan Li","Jiawen Kang","Bo Zheng","Simon Wong","Brian Mak","Helene Fung","Jean Woo","Man-Wai Mak","Timothy Kwok","Vincent Mok","Xianmin Gong","Xixin Wu","Xunying Liu","Patrick Wong","Helen Meng"],"pdf_url":"https://arxiv.org/pdf/2501.03727v1.pdf","comment":"12 pages, 8 figures"},{"id":"http://arxiv.org/abs/2409.03260v2","updated":"2025-01-07T11:54:58Z","published":"2024-09-05T05:51:42Z","title":"In Search of Trees: Decision-Tree Policy Synthesis for Black-Box Systems\n via Search","summary":" Decision trees, owing to their interpretability, are attractive as control\npolicies for (dynamical) systems. Unfortunately, constructing, or synthesising,\nsuch policies is a challenging task. Previous approaches do so by imitating a\nneural-network policy, approximating a tabular policy obtained via formal\nsynthesis, employing reinforcement learning, or modelling the problem as a\nmixed-integer linear program. However, these works may require access to a\nhard-to-obtain accurate policy or a formal model of the environment (within\nreach of formal synthesis), and may not provide guarantees on the quality or\nsize of the final tree policy. 
In contrast, we present an approach to\nsynthesise optimal decision-tree policies given a deterministic black-box\nenvironment and specification, a discretisation of the tree predicates, and an\ninitial set of states, where optimality is defined with respect to the number\nof steps to achieve the goal. Our approach is a specialised search algorithm\nwhich systematically explores the (exponentially large) space of decision trees\nunder the given discretisation. The key component is a novel trace-based\npruning mechanism that significantly reduces the search space. Our approach\nrepresents a conceptually novel way of synthesising small decision-tree\npolicies with optimality guarantees even for black-box environments with\nblack-box specifications.\n","authors":["Emir Demirović","Christian Schilling","Anna Lukina"],"pdf_url":"https://arxiv.org/pdf/2409.03260v2.pdf","comment":"8 pages main text incl. references, 2 pages appendix"},{"id":"http://arxiv.org/abs/2501.03715v1","updated":"2025-01-07T11:44:25Z","published":"2025-01-07T11:44:25Z","title":"Neural Deconstruction Search for Vehicle Routing Problems","summary":" Autoregressive construction approaches generate solutions to vehicle routing\nproblems in a step-by-step fashion, leading to high-quality solutions that are\nnearing the performance achieved by handcrafted, operations research\ntechniques. In this work, we challenge the conventional paradigm of sequential\nsolution construction and introduce an iterative search framework where\nsolutions are instead deconstructed by a neural policy. Throughout the search,\nthe neural policy collaborates with a simple greedy insertion algorithm to\nrebuild the deconstructed solutions. 
Our approach surpasses the performance of\nstate-of-the-art operations research methods across three challenging vehicle\nrouting problems of various problem sizes.\n","authors":["André Hottung","Paula Wong-Chung","Kevin Tierney"],"pdf_url":"https://arxiv.org/pdf/2501.03715v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03711v1","updated":"2025-01-07T11:32:13Z","published":"2025-01-07T11:32:13Z","title":"Unsupervised Speech Segmentation: A General Approach Using Speech\n Language Models","summary":" In this paper, we introduce an unsupervised approach for Speech Segmentation,\nwhich builds on previously researched approaches, e.g., Speaker Diarization,\nwhile being applicable to an inclusive set of acoustic-semantic distinctions,\npaving a path towards a general Unsupervised Speech Segmentation approach.\nUnlike traditional speech and audio segmentation, which mainly focuses on\nspectral changes in the input signal, e.g., phone segmentation, our approach\ntries to segment the spoken utterance into chunks with differing\nacoustic-semantic styles, focusing on acoustic-semantic information that does\nnot translate well into text, e.g., emotion or speaker. While most Speech\nSegmentation tasks only handle one style change, e.g., emotion diarization, our\napproach tries to handle multiple acoustic-semantic style changes. Leveraging\nrecent advances in Speech Language Models (SLMs), we propose a simple\nunsupervised method to segment a given speech utterance. We empirically\ndemonstrate the effectiveness of the proposed approach by considering several\nsetups. Results suggest that the proposed method is superior to the evaluated\nbaselines on boundary detection, segment purity, and over-segmentation. 
Code is\navailable at\nhttps://github.com/avishaiElmakies/unsupervised_speech_segmentation_using_slm.\n","authors":["Avishai Elmakies","Omri Abend","Yossi Adi"],"pdf_url":"https://arxiv.org/pdf/2501.03711v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10573v2","updated":"2025-01-07T11:13:06Z","published":"2024-06-15T09:23:46Z","title":"Graph Neural Backdoor: Fundamentals, Methodologies, Applications, and\n Future Directions","summary":" Graph Neural Networks (GNNs) have significantly advanced various downstream\ngraph-relevant tasks, encompassing recommender systems, molecular structure\nprediction, social media analysis, etc. Despite the boosts of GNN, recent\nresearch has empirically demonstrated its potential vulnerability to backdoor\nattacks, wherein adversaries employ triggers to poison input samples, inducing\nGNN to adversary-premeditated malicious outputs. This is typically due to the\ncontrolled training process, or the deployment of untrusted models, such as\ndelegating model training to third-party service, leveraging external training\nsets, and employing pre-trained models from online sources. Although there's an\nongoing increase in research on GNN backdoors, comprehensive investigation into\nthis field is lacking. To bridge this gap, we propose the first survey\ndedicated to GNN backdoors. We begin by outlining the fundamental definition of\nGNN, followed by the detailed summarization and categorization of current GNN\nbackdoor attacks and defenses based on their technical characteristics and\napplication scenarios. Subsequently, the analysis of the applicability and use\ncases of GNN backdoors is undertaken. Finally, the exploration of potential\nresearch directions of GNN backdoors is presented. 
This survey aims to explore\nthe principles of graph backdoors, provide insights to defenders, and promote\nfuture security research.\n","authors":["Xiao Yang","Gaolei Li","Jianhua Li"],"pdf_url":"https://arxiv.org/pdf/2406.10573v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03697v1","updated":"2025-01-07T11:01:24Z","published":"2025-01-07T11:01:24Z","title":"Deep Networks are Reproducing Kernel Chains","summary":" Identifying an appropriate function space for deep neural networks remains a\nkey open question. While shallow neural networks are naturally associated with\nReproducing Kernel Banach Spaces (RKBS), deep networks present unique\nchallenges. In this work, we extend RKBS to chain RKBS (cRKBS), a new framework\nthat composes kernels rather than functions, preserving the desirable\nproperties of RKBS. We prove that any deep neural network function is a neural\ncRKBS function, and conversely, any neural cRKBS function defined on a finite\ndataset corresponds to a deep neural network. This approach provides a sparse\nsolution to the empirical risk minimization problem, requiring no more than $N$\nneurons per layer, where $N$ is the number of data points.\n","authors":["Tjeerd Jan Heeringa","Len Spek","Christoph Brune"],"pdf_url":"https://arxiv.org/pdf/2501.03697v1.pdf","comment":"25 pages, 3 figures"},{"id":"http://arxiv.org/abs/2501.03696v1","updated":"2025-01-07T10:54:44Z","published":"2025-01-07T10:54:44Z","title":"Exploring Molecule Generation Using Latent Space Graph Diffusion","summary":" Generating molecular graphs is a challenging task due to their discrete\nnature and the competitive objectives involved. Diffusion models have emerged\nas SOTA approaches in data generation across various modalities. For molecular\ngraphs, graph neural networks (GNNs) as a diffusion backbone have achieved\nimpressive results. 
Latent space diffusion, where diffusion occurs in a\nlow-dimensional space via an autoencoder, has demonstrated computational\nefficiency. However, the literature on latent space diffusion for molecular\ngraphs is scarce, and no commonly accepted best practices exist. In this work,\nwe explore different approaches and hyperparameters, contrasting generative\nflow models (denoising diffusion, flow matching, heat dissipation) and\narchitectures (GNNs and E(3)-equivariant GNNs). Our experiments reveal a high\nsensitivity to the choice of approach and design decisions. Code is made\navailable at\ngithub.com/Prashanth-Pombala/Molecule-Generation-using-Latent-Space-Graph-Diffusion.\n","authors":["Prashanth Pombala","Gerrit Grossmann","Verena Wolf"],"pdf_url":"https://arxiv.org/pdf/2501.03696v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.05898v4","updated":"2025-01-07T10:42:21Z","published":"2024-10-08T10:55:40Z","title":"Manifolds, Random Matrices and Spectral Gaps: The geometric phases of\n generative diffusion","summary":" In this paper, we investigate the latent geometry of generative diffusion\nmodels under the manifold hypothesis. For this purpose, we analyze the spectrum\nof eigenvalues (and singular values) of the Jacobian of the score function,\nwhose discontinuities (gaps) reveal the presence and dimensionality of distinct\nsub-manifolds. Using a statistical physics approach, we derive the spectral\ndistributions and formulas for the spectral gaps under several distributional\nassumptions, and we compare these theoretical predictions with the spectra\nestimated from trained networks. Our analysis reveals the existence of three\ndistinct qualitative phases during the generative process: a trivial phase; a\nmanifold coverage phase where the diffusion process fits the distribution\ninternal to the manifold; a consolidation phase where the score becomes\northogonal to the manifold and all particles are projected on the support of\nthe data. 
This `division of labor' between different timescales provides an\nelegant explanation of why generative diffusion models are not affected by the\nmanifold overfitting phenomenon that plagues likelihood-based models, since the\ninternal distribution and the manifold geometry are produced at different time\npoints during generation.\n","authors":["Enrico Ventura","Beatrice Achilli","Gianluigi Silvestri","Carlo Lucibello","Luca Ambrogioni"],"pdf_url":"https://arxiv.org/pdf/2410.05898v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03687v1","updated":"2025-01-07T10:34:12Z","published":"2025-01-07T10:34:12Z","title":"Run-and-tumble chemotaxis using reinforcement learning","summary":" Bacterial cells use run-and-tumble motion to climb up attractant\nconcentration gradient in their environment. By extending the uphill runs and\nshortening the downhill runs the cells migrate towards the higher attractant\nzones. Motivated by this, we formulate a reinforcement learning (RL) algorithm\nwhere an agent moves in one dimension in the presence of an attractant\ngradient. The agent can perform two actions: either persistent motion in the\nsame direction or reversal of direction. We assign costs for these actions\nbased on the recent history of the agent's trajectory. We ask the question:\nwhich RL strategy works best in different types of attractant environment. We\nquantify efficiency of the RL strategy by the ability of the agent (a) to\nlocalize in the favorable zones after large times, and (b) to learn about its\ncomplete environment. 
Depending on the attractant profile and the initial\ncondition, we find an optimum balance is needed between exploration and\nexploitation to ensure the most efficient performance.\n","authors":["Ramesh Pramanik","Shradha Mishra","Sakuntala Chatterjee"],"pdf_url":"https://arxiv.org/pdf/2501.03687v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.13552v2","updated":"2025-01-07T10:26:47Z","published":"2024-06-19T13:39:05Z","title":"Standardness Clouds Meaning: A Position Regarding the Informed Usage of\n Standard Datasets","summary":" Standard datasets are frequently used to train and evaluate Machine Learning\nmodels. However, the assumed standardness of these datasets leads to a lack of\nin-depth discussion on how their labels match the derived categories for the\nrespective use case, which we demonstrate by reviewing recent literature that\nemploys standard datasets. We find that the standardness of the datasets seems\nto cloud their actual coherency and applicability, thus impeding the trust in\nMachine Learning models trained on these datasets. Therefore, we argue against\nthe uncritical use of standard datasets and advocate for their critical\nexamination instead. For this, we suggest to use Grounded Theory in combination\nwith Hypotheses Testing through Visualization as methods to evaluate the match\nbetween use case, derived categories, and labels. We exemplify this approach by\napplying it to the 20 Newsgroups dataset and the MNIST dataset, both considered\nstandard datasets in their respective domain. The results show that the labels\nof the 20 Newsgroups dataset are imprecise, which implies that neither a\nMachine Learning model can learn a meaningful abstraction of derived categories\nnor one can draw conclusions from achieving high accuracy on this dataset. For\nthe MNIST dataset, we demonstrate that the labels can be confirmed to be\ndefined well. 
We conclude that also for datasets that are considered to be\nstandard, quality and suitability have to be assessed in order to learn\nmeaningful abstractions and, thus, improve trust in Machine Learning models.\n","authors":["Tim Cech","Ole Wegen","Daniel Atzberger","Rico Richter","Willy Scheibel","Jürgen Döllner"],"pdf_url":"https://arxiv.org/pdf/2406.13552v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03676v1","updated":"2025-01-07T10:22:30Z","published":"2025-01-07T10:22:30Z","title":"SALE-Based Offline Reinforcement Learning with Ensemble Q-Networks","summary":" In this work, we build upon the offline reinforcement learning algorithm TD7,\nwhich incorporates State-Action Learned Embeddings (SALE) and LAP, and propose\na model-free actor-critic algorithm that integrates ensemble Q-networks and a\ngradient diversity penalty from EDAC. The ensemble Q-networks effectively\naddress the challenge of out-of-distribution actions by introducing penalties\nthat guide the actor network to focus on in-distribution actions. Meanwhile,\nthe gradient diversity penalty encourages diverse Q-value gradients, further\nsuppressing overestimation for out-of-distribution actions. Additionally, our\nmethod retains an adjustable behavior cloning (BC) term that directs the actor\nnetwork toward dataset actions during early training stages, while gradually\nreducing its influence as the precision of the Q-ensemble improves. 
These\nenhancements work synergistically to improve training stability and accuracy.\nExperimental results on the D4RL MuJoCo benchmarks demonstrate that our\nalgorithm achieves superior convergence speed, stability, and performance\ncompared to existing methods.\n","authors":["Zheng Chun"],"pdf_url":"https://arxiv.org/pdf/2501.03676v1.pdf","comment":"10 pages, 2 figures, 4 tables"},{"id":"http://arxiv.org/abs/2501.03671v1","updated":"2025-01-07T10:18:37Z","published":"2025-01-07T10:18:37Z","title":"Imitation Learning of MPC with Neural Networks: Error Guarantees and\n Sparsification","summary":" This paper presents a framework for bounding the approximation error in\nimitation model predictive controllers utilizing neural networks. Leveraging\nthe Lipschitz properties of these neural networks, we derive a bound that\nguides dataset design to ensure the approximation error remains at chosen\nlimits. We discuss how this method can be used to design a stable neural\nnetwork controller with performance guarantees employing existing robust model\npredictive control approaches for data generation. 
Additionally, we introduce a\ntraining adjustment, which is based on the sensitivities of the optimization\nproblem and reduces dataset density requirements based on the derived bounds.\nWe verify that the proposed augmentation results in improvements to the\nnetwork's predictive capabilities and a reduction of the Lipschitz constant.\nMoreover, on a simulated inverted pendulum problem, we show that the approach\nresults in a closer match of the closed-loop behavior between the imitation and\nthe original model predictive controller.\n","authors":["Hendrik Alsmeier","Lukas Theiner","Anton Savchenko","Ali Mesbah","Rolf Findeisen"],"pdf_url":"https://arxiv.org/pdf/2501.03671v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03666v1","updated":"2025-01-07T10:06:59Z","published":"2025-01-07T10:06:59Z","title":"Hybrid Machine Learning Model with a Constrained Action Space for\n Trajectory Prediction","summary":" Trajectory prediction is crucial to advance autonomous driving, improving\nsafety, and efficiency. Although end-to-end models based on deep learning have\ngreat potential, they often do not consider vehicle dynamic limitations,\nleading to unrealistic predictions. To address this problem, this work\nintroduces a novel hybrid model that combines deep learning with a kinematic\nmotion model. It is able to predict object attributes such as acceleration and\nyaw rate and generate trajectories based on them. A key contribution is the\nincorporation of expert knowledge into the learning objective of the deep\nlearning model. This results in the constraint of the available action space,\nthus enabling the prediction of physically feasible object attributes and\ntrajectories, thereby increasing safety and robustness. The proposed hybrid\nmodel facilitates enhanced interpretability, thereby reinforcing the\ntrustworthiness of deep learning methods and promoting the development of safe\nplanning solutions. 
Experiments conducted on the publicly available real-world\nArgoverse dataset demonstrate realistic driving behaviour, with benchmark\ncomparisons and ablation studies showing promising results.\n","authors":["Alexander Fertig","Lakshman Balasubramanian","Michael Botsch"],"pdf_url":"https://arxiv.org/pdf/2501.03666v1.pdf","comment":"Submitted to 2025 IEEE Intelligent Vehicles Symposium (IV)"},{"id":"http://arxiv.org/abs/2402.10456v2","updated":"2025-01-07T10:03:08Z","published":"2024-02-16T05:27:05Z","title":"Efficient Generative Modeling via Penalized Optimal Transport Network","summary":" The generation of synthetic data with distributions that faithfully emulate\nthe underlying data-generating mechanism holds paramount significance.\nWasserstein Generative Adversarial Networks (WGANs) have emerged as a prominent\ntool for this task; however, due to the delicate equilibrium of the minimax\nformulation and the instability of Wasserstein distance in high dimensions,\nWGAN often manifests the pathological phenomenon of mode collapse. This results\nin generated samples that converge to a restricted set of outputs and fail to\nadequately capture the tail behaviors of the true distribution. Such\nlimitations can lead to serious downstream consequences. To this end, we\npropose the Penalized Optimal Transport Network (POTNet), a versatile deep\ngenerative model based on the marginally-penalized Wasserstein (MPW) distance.\nThrough the MPW distance, POTNet effectively leverages low-dimensional marginal\ninformation to guide the overall alignment of joint distributions. Furthermore,\nour primal-based framework enables direct evaluation of the MPW distance, thus\neliminating the need for a critic network. This formulation circumvents\ntraining instabilities inherent in adversarial approaches and avoids the need\nfor extensive parameter tuning. 
We derive a non-asymptotic bound on the\ngeneralization error of the MPW loss and establish convergence rates of the\ngenerative distribution learned by POTNet. Our theoretical analysis together\nwith extensive empirical evaluations demonstrate the superior performance of\nPOTNet in accurately capturing underlying data structures, including their tail\nbehaviors and minor modalities. Moreover, our model achieves orders of\nmagnitude speedup during the sampling stage compared to state-of-the-art\nalternatives, which enables computationally efficient large-scale synthetic\ndata generation.\n","authors":["Wenhui Sophia Lu","Chenyang Zhong","Wing Hung Wong"],"pdf_url":"https://arxiv.org/pdf/2402.10456v2.pdf","comment":"54 pages, 12 figures"},{"id":"http://arxiv.org/abs/2409.14887v3","updated":"2025-01-07T09:55:57Z","published":"2024-09-23T10:35:57Z","title":"Deploying Open-Source Large Language Models: A performance Analysis","summary":" Since the release of ChatGPT in November 2022, large language models (LLMs)\nhave seen considerable success, including in the open-source community, with\nmany open-weight models available. However, the requirements to deploy such a\nservice are often unknown and difficult to evaluate in advance. To facilitate\nthis process, we conducted numerous tests at the Centre Inria de l'Universit\\'e\nde Bordeaux. In this article, we propose a comparison of the performance of\nseveral models of different sizes (mainly Mistral and LLaMa) depending on the\navailable GPUs, using vLLM, a Python library designed to optimize the inference\nof these models. Our results provide valuable information for private and\npublic groups wishing to deploy LLMs, allowing them to evaluate the performance\nof different models based on their available hardware. 
This study thus\ncontributes to facilitating the adoption and use of these large language models\nin various application domains.\n","authors":["Yannis Bendi-Ouis","Dan Dutartre","Xavier Hinaut"],"pdf_url":"https://arxiv.org/pdf/2409.14887v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.02781v2","updated":"2025-01-07T09:54:50Z","published":"2025-01-06T05:53:38Z","title":"From Dense to Sparse: Event Response for Enhanced Residential Load\n Forecasting","summary":" Residential load forecasting (RLF) is crucial for resource scheduling in\npower systems. Most existing methods utilize all given load records (dense\ndata) to indiscriminately extract the dependencies between historical and\nfuture time series. However, there exist important regular patterns residing in\nthe event-related associations among different appliances (sparse knowledge),\nwhich have so far been ignored. In this paper, we propose an Event-Response\nKnowledge Guided approach (ERKG) for RLF by incorporating the estimation of\nelectricity usage events for different appliances, mining event-related sparse\nknowledge from the load series. With ERKG, the event-response estimation\nenables portraying the electricity consumption behaviors of residents,\nrevealing regular variations in appliance operational states. To be specific,\nERKG consists of knowledge extraction and guidance: i) a forecasting model is\ndesigned for the electricity usage events by estimating appliance operational\nstates, aiming to extract the event-related sparse knowledge; ii) a novel\nknowledge-guided mechanism is established by fusing such state estimates of the\nappliance events into the RLF model, which can give particular focus to the\npatterns of users' electricity consumption behaviors. Notably, ERKG can flexibly\nserve as a plug-in module to boost the capability of existing forecasting\nmodels by leveraging event response. 
In numerical experiments, extensive\ncomparisons and ablation studies have verified the effectiveness of our ERKG,\ne.g., MAE can be reduced by over 8% on the tested state-of-the-art forecasting\nmodels.\n","authors":["Xin Cao","Qinghua Tao","Yingjie Zhou","Lu Zhang","Le Zhang","Dongjin Song","Dapeng Oliver Wu","Ce Zhu"],"pdf_url":"https://arxiv.org/pdf/2501.02781v2.pdf","comment":"12 pages and 6 figures. Accepted for publication by IEEE Transactions\n on Instrumentation and Measurement"},{"id":"http://arxiv.org/abs/2501.03654v1","updated":"2025-01-07T09:40:02Z","published":"2025-01-07T09:40:02Z","title":"Data Augmentation for Deep Learning Regression Tasks by Machine Learning\n Models","summary":" Deep learning (DL) models have gained prominence in domains such as computer\nvision and natural language processing but remain underutilized for regression\ntasks involving tabular data. In these cases, traditional machine learning (ML)\nmodels often outperform DL models. In this study, we propose and evaluate\nvarious data augmentation (DA) techniques to improve the performance of DL\nmodels for tabular data regression tasks. We compare the performance gain of\nNeural Networks by different DA strategies ranging from a naive method of\nduplicating existing observations and adding noise to a more sophisticated DA\nstrategy that preserves the underlying statistical relationship in the data.\nOur analysis demonstrates that the advanced DA method significantly improves DL\nmodel performance across multiple datasets and regression tasks, resulting in\nan average performance increase of over 10\\% compared to baseline models\nwithout augmentation. The efficacy of these DA strategies was rigorously\nvalidated across 30 distinct datasets, with multiple iterations and evaluations\nusing three different automated deep learning (AutoDL) frameworks: AutoKeras,\nH2O, and AutoGluon. 
This study demonstrates that by leveraging advanced DA\ntechniques, DL models can realize their full potential in regression tasks,\nthereby contributing to broader adoption and enhanced performance in practical\napplications.\n","authors":["Assaf Shmuel","Oren Glickman","Teddy Lazebnik"],"pdf_url":"https://arxiv.org/pdf/2501.03654v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.16370v2","updated":"2025-01-07T09:34:51Z","published":"2024-11-25T13:26:09Z","title":"A Review of Bayesian Uncertainty Quantification in Deep Probabilistic\n Image Segmentation","summary":" Advancements in image segmentation play an integral role within the broad\nscope of Deep Learning-based Computer Vision. Furthermore, their widespread\napplicability in critical real-world tasks has resulted in challenges related\nto the reliability of such algorithms. Hence, uncertainty quantification has\nbeen extensively studied within this context, enabling the expression of model\nignorance (epistemic uncertainty) or data ambiguity (aleatoric uncertainty) to\nprevent uninformed decision-making. Due to the rapid adoption of Convolutional\nNeural Network (CNN)-based segmentation models in high-stake applications, a\nsubstantial body of research has been published on this very topic, causing its\nswift expansion into a distinct field. This work provides a comprehensive\noverview of probabilistic segmentation, by discussing fundamental concepts of\nuncertainty quantification, governing advancements in the field as well as the\napplication to various tasks. Moreover, the literature on both types of\nuncertainties traces back to four key applications: (1) to quantify statistical\ninconsistencies in the annotation process due to ambiguous images, (2)\ncorrelating prediction error with uncertainty, (3) expanding the model\nhypothesis space for better generalization, and (4) Active Learning. 
An extensive discussion follows\nthat includes an overview of utilized datasets for each of the applications and\nevaluation of the available methods. We also highlight challenges related to\narchitectures, uncertainty quantification methods, standardization and\nbenchmarking, and finally end with recommendations for future work such as\nmethods based on single forward passes and models that appropriately leverage\nvolumetric data.\n","authors":["M. M. A. Valiuddin","R. J. G. van Sloun","C. G. A. Viviers","P. H. N. de With","F. van der Sommen"],"pdf_url":"https://arxiv.org/pdf/2411.16370v2.pdf","comment":"20 pages, revised"},{"id":"http://arxiv.org/abs/2402.00592v4","updated":"2025-01-07T09:24:34Z","published":"2024-02-01T13:41:44Z","title":"Partial-Label Learning with a Reject Option","summary":" In real-world applications, one often encounters ambiguously labeled data,\nwhere different annotators assign conflicting class labels. Partial-label\nlearning allows training classifiers in this weakly supervised setting, where\nstate-of-the-art methods already show good predictive performance. However,\neven the best algorithms give incorrect predictions, which can have severe\nconsequences when they impact actions or decisions. We propose a novel\nrisk-consistent nearest-neighbor-based partial-label learning algorithm with a\nreject option, that is, the algorithm can reject unsure predictions. Extensive\nexperiments on artificial and real-world datasets show that our method provides\nthe best trade-off between the number and accuracy of non-rejected predictions\nwhen compared to our competitors, which use confidence thresholds for rejecting\nunsure predictions. 
When evaluated without the reject option, our\nnearest-neighbor-based approach also achieves competitive prediction\nperformance.\n","authors":["Tobias Fuchs","Florian Kalinke","Klemens Böhm"],"pdf_url":"https://arxiv.org/pdf/2402.00592v4.pdf","comment":"Accepted for publication at TMLR"},{"id":"http://arxiv.org/abs/2501.03635v1","updated":"2025-01-07T09:10:09Z","published":"2025-01-07T09:10:09Z","title":"MHGNet: Multi-Heterogeneous Graph Neural Network for Traffic Prediction","summary":" In recent years, traffic flow prediction has played a crucial role in the\nmanagement of intelligent transportation systems. However, traditional\nforecasting methods often model non-Euclidean low-dimensional traffic data as a\nsimple graph with single-type nodes and edges, failing to capture similar\ntrends among nodes of the same type. To address this limitation, this paper\nproposes MHGNet, a novel framework for modeling spatiotemporal\nmulti-heterogeneous graphs. Within this framework, the STD Module decouples\nsingle-pattern traffic data into multi-pattern traffic data through feature\nmappings of timestamp embedding matrices and node embedding matrices.\nSubsequently, the Node Clusterer leverages the Euclidean distance between nodes\nand different types of limit points to perform clustering with O(N) time\ncomplexity. The nodes within each cluster undergo residual subgraph convolution\nwithin the spatiotemporal fusion subgraphs generated by the DSTGG Module,\nfollowed by processing in the SIE Module for node repositioning and\nredistribution of weights. 
To validate the effectiveness of MHGNet, this paper\nconducts extensive ablation studies and quantitative evaluations on four widely\nused benchmarks, demonstrating its superior performance.\n","authors":["Mei Wu","Yiqian Lin","Tianfan Jiang","Wenchao Weng"],"pdf_url":"https://arxiv.org/pdf/2501.03635v1.pdf","comment":"Accepted by 2025 IEEE International Conference on Acoustics, Speech,\n and Signal Processing (ICASSP 2025)"},{"id":"http://arxiv.org/abs/2406.02017v2","updated":"2025-01-07T09:02:36Z","published":"2024-06-04T06:57:12Z","title":"On the Mode-Seeking Properties of Langevin Dynamics","summary":" The Langevin Dynamics framework, which aims to generate samples from the\nscore function of a probability distribution, is widely used for analyzing and\ninterpreting score-based generative modeling. While the convergence behavior of\nLangevin Dynamics under unimodal distributions has been extensively studied in\nthe literature, in practice the data distribution could consist of multiple\ndistinct modes. In this work, we investigate Langevin Dynamics in producing\nsamples from multimodal distributions and theoretically study its mode-seeking\nproperties. We prove that under a variety of sub-Gaussian mixtures, Langevin\nDynamics is unlikely to find all mixture components within a sub-exponential\nnumber of steps in the data dimension. To reduce the mode-seeking tendencies of\nLangevin Dynamics, we propose \emph{Chained Langevin Dynamics}, which divides\nthe data vector into patches of constant size and generates every patch\nsequentially conditioned on the previous patches. We perform a theoretical\nanalysis of Chained Langevin Dynamics by reducing it to sampling from a\nconstant-dimensional distribution. We present the results of several numerical\nexperiments on synthetic and real image datasets, supporting our theoretical\nresults on the iteration complexities of sample generation from mixture\ndistributions using the chained and vanilla Langevin Dynamics. 
The code is\navailable at https://github.com/Xiwei-Cheng/Chained_LD.\n","authors":["Xiwei Cheng","Kexin Fu","Farzan Farnia"],"pdf_url":"https://arxiv.org/pdf/2406.02017v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03627v1","updated":"2025-01-07T08:54:42Z","published":"2025-01-07T08:54:42Z","title":"Coupled Hierarchical Structure Learning using Tree-Wasserstein Distance","summary":" In many applications, both data samples and features have underlying\nhierarchical structures. However, existing methods for learning these latent\nstructures typically focus on either samples or features, ignoring possible\ncoupling between them. In this paper, we introduce a coupled hierarchical\nstructure learning method using tree-Wasserstein distance (TWD). Our method\njointly computes TWDs for samples and features, representing their latent\nhierarchies as trees. We propose an iterative, unsupervised procedure to build\nthese sample and feature trees based on diffusion geometry, hyperbolic\ngeometry, and wavelet filters. We show that this iterative procedure converges\nand empirically improves the quality of the constructed trees. The method is\nalso computationally efficient and scales well in high-dimensional settings.\nOur method can be seamlessly integrated with hyperbolic graph convolutional\nnetworks (HGCN). We demonstrate that our method outperforms competing\napproaches in sparse approximation and unsupervised Wasserstein distance\nlearning on several word-document and single-cell RNA-sequencing datasets. In\naddition, integrating our method into HGCN enhances performance in link\nprediction and node classification tasks.\n","authors":["Ya-Wei Eileen Lin","Ronald R. 
Coifman","Gal Mishne","Ronen Talmon"],"pdf_url":"https://arxiv.org/pdf/2501.03627v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.06538v2","updated":"2025-01-07T08:52:30Z","published":"2022-08-13T01:20:39Z","title":"Transferable Adversarial Examples with Bayes Approach","summary":" The vulnerability of deep neural networks (DNNs) to black-box adversarial\nattacks is one of the most heated topics in trustworthy AI. In such attacks,\nthe attackers operate without any insider knowledge of the model, making the\ncross-model transferability of adversarial examples critical. Despite the\npotential for adversarial examples to be effective across various models, it\nhas been observed that adversarial examples that are specifically crafted for a\nspecific model often exhibit poor transferability. In this paper, we explore\nthe transferability of adversarial examples via the lens of Bayesian approach.\nSpecifically, we leverage Bayesian approach to probe the transferability and\nthen study what constitutes a transferability-promoting prior. Following this,\nwe design two concrete transferability-promoting priors, along with an adaptive\ndynamic weighting strategy for instances sampled from these priors. Employing\nthese techniques, we present BayAtk. 
Extensive experiments illustrate the\nsignificant effectiveness of BayAtk in crafting more transferable adversarial\nexamples against both undefended and defended black-box models compared to\nexisting state-of-the-art attacks.\n","authors":["Mingyuan Fan","Cen Chen","Wenmeng Zhou","Yinggui Wang"],"pdf_url":"https://arxiv.org/pdf/2208.06538v2.pdf","comment":"Accepted in AsiaCCS'25"},{"id":"http://arxiv.org/abs/2406.00502v3","updated":"2025-01-07T08:50:35Z","published":"2024-06-01T17:10:56Z","title":"Non-geodesically-convex optimization in the Wasserstein space","summary":" We study a class of optimization problems in the Wasserstein space (the space\nof probability measures) where the objective function is nonconvex along\ngeneralized geodesics. Specifically, the objective exhibits some\ndifference-of-convex structure along these geodesics. The setting also\nencompasses sampling problems where the logarithm of the target distribution is\ndifference-of-convex. We derive multiple convergence insights for a novel semi\nForward-Backward Euler scheme under several nonconvex (and possibly nonsmooth)\nregimes. 
Notably, the semi Forward-Backward Euler is just a slight modification\nof the Forward-Backward Euler whose convergence is -- to our knowledge -- still\nunknown in our very general non-geodesically-convex setting.\n","authors":["Hoang Phuc Hau Luu","Hanlin Yu","Bernardo Williams","Petrus Mikkola","Marcelo Hartmann","Kai Puolamäki","Arto Klami"],"pdf_url":"https://arxiv.org/pdf/2406.00502v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08980v2","updated":"2025-01-07T08:49:30Z","published":"2024-04-13T12:07:20Z","title":"Stability and Generalization in Free Adversarial Training","summary":" While adversarial training methods have significantly improved the robustness\nof deep neural networks against norm-bounded adversarial perturbations, the\ngeneralization gap between their performance on training and test data is\nconsiderably greater than that of standard empirical risk minimization. Recent\nstudies have aimed to connect the generalization properties of adversarially\ntrained classifiers to the min-max optimization algorithm used in their\ntraining. In this work, we analyze the interconnections between generalization\nand optimization in adversarial training using the algorithmic stability\nframework. Specifically, our goal is to compare the generalization gap of\nneural networks trained using the vanilla adversarial training method, which\nfully optimizes perturbations at every iteration, with the free adversarial\ntraining method, which simultaneously optimizes norm-bounded perturbations and\nclassifier parameters. We prove bounds on the generalization error of these\nmethods, indicating that the free adversarial training method may exhibit a\nlower generalization gap between training and test samples due to its\nsimultaneous min-max optimization of classifier weights and perturbation\nvariables. We conduct several numerical experiments to evaluate the\ntrain-to-test generalization gap in vanilla and free adversarial training\nmethods. 
Our empirical findings also suggest that the free adversarial training\nmethod could lead to a smaller generalization gap over a similar number of\ntraining iterations. The paper code is available at\nhttps://github.com/Xiwei-Cheng/Stability_FreeAT.\n","authors":["Xiwei Cheng","Kexin Fu","Farzan Farnia"],"pdf_url":"https://arxiv.org/pdf/2404.08980v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04195v2","updated":"2025-01-07T08:46:02Z","published":"2023-09-08T08:12:29Z","title":"Towards Mitigating Architecture Overfitting on Distilled Datasets","summary":" Dataset distillation methods have demonstrated remarkable performance for\nneural networks trained with very limited training data. However, a significant\nchallenge arises in the form of \\textit{architecture overfitting}: the\ndistilled training dataset synthesized by a specific network architecture\n(i.e., training network) generates poor performance when trained by other\nnetwork architectures (i.e., test networks), especially when the test networks\nhave a larger capacity than the training network. This paper introduces a\nseries of approaches to mitigate this issue. Among them, DropPath renders the\nlarge model to be an implicit ensemble of its sub-networks, and knowledge\ndistillation ensures each sub-network acts similarly to the small but\nwell-performing teacher network. These methods, characterized by their\nsmoothing effects, significantly mitigate architecture overfitting. We conduct\nextensive experiments to demonstrate the effectiveness and generality of our\nmethods. Particularly, across various scenarios involving different tasks and\ndifferent sizes of distilled data, our approaches significantly mitigate\narchitecture overfitting. 
Furthermore, our approaches achieve comparable or\neven superior performance when the test network is larger than the training\nnetwork.\n","authors":["Xuyang Zhong","Chen Liu"],"pdf_url":"https://arxiv.org/pdf/2309.04195v2.pdf","comment":"Accepted by TNNLS"},{"id":"http://arxiv.org/abs/2501.02721v2","updated":"2025-01-07T08:14:34Z","published":"2025-01-06T02:25:48Z","title":"Learning Stochastic Nonlinear Dynamics with Embedded Latent Transfer\n Operators","summary":" We consider an operator-based latent Markov representation of a stochastic\nnonlinear dynamical system, where the stochastic evolution of the latent state\nembedded in a reproducing kernel Hilbert space is described with the\ncorresponding transfer operator, and develop a spectral method to learn this\nrepresentation based on the theory of stochastic realization. The embedding may\nbe learned simultaneously using reproducing kernels, for example, constructed\nwith feed-forward neural networks. We also address the generalization of\nsequential state-estimation (Kalman filtering) in stochastic nonlinear systems,\nand of operator-based eigen-mode decomposition of dynamics, for the\nrepresentation. Several examples with synthetic and real-world data are shown\nto illustrate the empirical characteristics of our methods, and to investigate\nthe performance of our model in sequential state-estimation and mode\ndecomposition.\n","authors":["Naichang Ke","Ryogo Tanaka","Yoshinobu Kawahara"],"pdf_url":"https://arxiv.org/pdf/2501.02721v2.pdf","comment":"This submission includes a supplementary file providing additional\n details. It also contains a code directory (code/) for the experiments. 
Both\n are included within the TeX source package"},{"id":"http://arxiv.org/abs/2501.01216v3","updated":"2025-01-07T07:46:19Z","published":"2025-01-02T11:57:08Z","title":"TabTreeFormer: Tabular Data Generation Using Hybrid Tree-Transformer","summary":" Transformers have achieved remarkable success in tabular data generation.\nHowever, they lack domain-specific inductive biases which are critical to\npreserving the intrinsic characteristics of tabular data. Meanwhile, they\nsuffer from poor scalability and efficiency due to quadratic computational\ncomplexity. In this paper, we propose TabTreeFormer, a hybrid transformer\narchitecture that incorporates a tree-based model that retains tabular-specific\ninductive biases of non-smooth and potentially low-correlated patterns caused\nby discreteness and non-rotational invariance, and hence enhances the fidelity\nand utility of synthetic data. In addition, we devise a dual-quantization\ntokenizer to capture the multimodal continuous distribution and further\nfacilitate the learning of numerical value distribution. Moreover, our proposed\ntokenizer reduces the vocabulary size and sequence length due to the limited\ncomplexity (e.g., dimension-wise semantic meaning) of tabular data, rendering a\nsignificant model size shrink without sacrificing the capability of the\ntransformer model. We evaluate TabTreeFormer on 10 datasets against multiple\ngenerative models on various metrics; our experimental results show that\nTabTreeFormer achieves superior fidelity, utility, privacy, and efficiency. 
Our\nbest model yields a 40% utility improvement with 1/16 of the baseline model\nsize.\n","authors":["Jiayu Li","Bingyin Zhao","Zilong Zhao","Kevin Yee","Uzair Javaid","Biplab Sikdar"],"pdf_url":"https://arxiv.org/pdf/2501.01216v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2105.11233v3","updated":"2025-01-07T07:39:45Z","published":"2021-05-15T12:18:31Z","title":"Gradient descent in materia through homodyne gradient extraction","summary":" Deep learning, a multi-layered neural network approach inspired by the brain,\nhas revolutionized machine learning. One of its key enablers has been\nbackpropagation, an algorithm that computes the gradient of a loss function\nwith respect to the weights and biases in the neural network model, in\ncombination with its use in gradient descent. However, the implementation of\ndeep learning in digital computers is intrinsically energy hungry, with energy\nconsumption becoming prohibitively high for many applications. This has\nstimulated the development of specialized hardware, ranging from neuromorphic\nCMOS integrated circuits and integrated photonic tensor cores to\nunconventional, material-based computing system. The learning process in these\nmaterial systems, realized, e.g., by artificial evolution, equilibrium\npropagation or surrogate modelling, is a complicated and time-consuming\nprocess. Here, we demonstrate a simple yet efficient and accurate gradient\nextraction method, based on the principle of homodyne detection, for performing\ngradient descent on a loss function directly in a physical system without the\nneed of an analytical description. By perturbing the parameters that need to be\noptimized using sinusoidal waveforms with distinct frequencies, we effectively\nobtain the gradient information in a highly robust and scalable manner. We\nillustrate the method in dopant network processing units, but argue that it is\napplicable in a wide range of physical systems. 
Homodyne gradient extraction\ncan in principle be fully implemented in materia, facilitating the development\nof autonomously learning material systems.\n","authors":["Marcus N. Boon","Lorenzo Cassola","Hans-Christian Ruiz Euler","Tao Chen","Bram van de Ven","Unai Alegre Ibarra","Peter A. Bobbert","Wilfred G. van der Wiel"],"pdf_url":"https://arxiv.org/pdf/2105.11233v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03584v1","updated":"2025-01-07T07:17:04Z","published":"2025-01-07T07:17:04Z","title":"Discriminative Representation learning via Attention-Enhanced\n Contrastive Learning for Short Text Clustering","summary":" Contrastive learning has gained significant attention in short text\nclustering, yet it has an inherent drawback of mistakenly identifying samples\nfrom the same category as negatives and then separating them in the feature\nspace (false negative separation), which hinders the generation of superior\nrepresentations. To generate more discriminative representations for efficient\nclustering, we propose a novel short text clustering method, called\nDiscriminative Representation learning via \textbf{A}ttention-\textbf{E}nhanced\n\textbf{C}ontrastive \textbf{L}earning for Short Text Clustering\n(\textbf{AECL}). The \textbf{AECL} consists of two modules which are the\npseudo-label generation module and the contrastive learning module. Both\nmodules build a sample-level attention mechanism to capture similarity\nrelationships between samples and aggregate cross-sample features to generate\nconsistent representations. Then, the former module uses the more\ndiscriminative consistent representation to produce reliable supervision\ninformation to assist clustering, while the latter module explores similarity\nrelationships and consistent representations to optimize the construction of\npositive samples to perform similarity-guided contrastive learning, effectively\naddressing the false negative separation issue. 
Experimental results\ndemonstrate that the proposed \\textbf{AECL} outperforms state-of-the-art\nmethods. If the paper is accepted, we will open-source the code.\n","authors":["Zhihao Yao"],"pdf_url":"https://arxiv.org/pdf/2501.03584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03583v1","updated":"2025-01-07T07:16:56Z","published":"2025-01-07T07:16:56Z","title":"STContext: A Multifaceted Dataset for Developing Context-aware\n Spatio-temporal Crowd Mobility Prediction Models","summary":" In smart cities, context-aware spatio-temporal crowd flow prediction (STCFP)\nmodels leverage contextual features (e.g., weather) to identify unusual crowd\nmobility patterns and enhance prediction accuracy. However, the best practice\nfor incorporating contextual features remains unclear due to inconsistent usage\nof contextual features in different papers. Developing a multifaceted dataset\nwith rich types of contextual features and STCFP scenarios is crucial for\nestablishing a principled context modeling paradigm. Existing open crowd flow\ndatasets lack an adequate range of contextual features, which poses an urgent\nrequirement to build a multifaceted dataset to fill these research gaps. To\nthis end, we create STContext, a multifaceted dataset for developing\ncontext-aware STCFP models. Specifically, STContext provides nine\nspatio-temporal datasets across five STCFP scenarios and includes ten\ncontextual features, including weather, air quality index, holidays, points of\ninterest, road networks, etc. Besides, we propose a unified workflow for\nincorporating contextual features into deep STCFP methods, with steps including\nfeature transformation, dependency modeling, representation fusion, and\ntraining strategies. Through extensive experiments, we have obtained several\nuseful guidelines for effective context modeling and insights for future\nresearch. 
The STContext is open-sourced at\nhttps://github.com/Liyue-Chen/STContext.\n","authors":["Liyue Chen","Jiangyi Fang","Tengfei Liu","Fangyuan Gao","Leye Wang"],"pdf_url":"https://arxiv.org/pdf/2501.03583v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.18263v3","updated":"2025-01-07T07:05:17Z","published":"2024-12-24T08:25:38Z","title":"High-Rank Irreducible Cartesian Tensor Decomposition and Bases of\n Equivariant Spaces","summary":" Irreducible Cartesian tensors (ICTs) play a crucial role in the design of\nequivariant graph neural networks, as well as in theoretical chemistry and\nchemical physics. Meanwhile, the design space of available linear operations on\ntensors that preserve symmetry presents a significant challenge. The ICT\ndecomposition and a basis of this equivariant space are difficult to obtain for\nhigh-order tensors. After decades of research, Bonvicini (2024) recently\nachieves an explicit ICT decomposition for $n=5$ with factorial time/space\ncomplexity. This work, for the first time, obtains decomposition matrices for\nICTs up to rank $n=9$ with reduced and affordable complexity, by constructing\nwhat we call path matrices. The path matrices are obtained via performing\nchain-like contraction with Clebsch-Gordan matrices following the parentage\nscheme. We prove and leverage that the concatenation of path matrices is an\northonormal change-of-basis matrix between the Cartesian tensor product space\nand the spherical direct sum spaces. Furthermore, we identify a complete\northogonal basis for the equivariant space, rather than a spanning set\n(Pearce-Crump, 2023b), through this path matrices technique. We further extend\nour result to the arbitrary tensor product and direct sum spaces, enabling free\ndesign between different spaces while keeping symmetry. 
The Python code is\navailable at\nhttps://github.com/ShihaoShao-GH/ICT-decomposition-and-equivariant-bases, where\nthe $n=6,\dots,9$ ICT decomposition matrices are obtained in 1s, 3s, 11s, and\n4m32s on a 28-core Intel(R) Xeon(R) Gold 6330 CPU @ 2.00GHz, respectively.\n","authors":["Shihao Shao","Yikang Li","Zhouchen Lin","Qinghua Cui"],"pdf_url":"https://arxiv.org/pdf/2412.18263v3.pdf","comment":"46 pages"},{"id":"http://arxiv.org/abs/2501.03575v1","updated":"2025-01-07T06:55:50Z","published":"2025-01-07T06:55:50Z","title":"Cosmos World Foundation Model Platform for Physical AI","summary":" Physical AI needs to be trained digitally first. It needs a digital twin of\nitself, the policy model, and a digital twin of the world, the world model. In\nthis paper, we present the Cosmos World Foundation Model Platform to help\ndevelopers build customized world models for their Physical AI setups. We\nposition a world foundation model as a general-purpose world model that can be\nfine-tuned into customized world models for downstream applications. Our\nplatform covers a video curation pipeline, pre-trained world foundation models,\nexamples of post-training of pre-trained world foundation models, and video\ntokenizers. 
To help Physical AI builders solve the most critical problems of\nour society, we make our platform open-source and our models open-weight with\npermissive licenses available via https://github.com/NVIDIA/Cosmos.\n","authors":[" NVIDIA"," :","Niket Agarwal","Arslan Ali","Maciej Bala","Yogesh Balaji","Erik Barker","Tiffany Cai","Prithvijit Chattopadhyay","Yongxin Chen","Yin Cui","Yifan Ding","Daniel Dworakowski","Jiaojiao Fan","Michele Fenzi","Francesco Ferroni","Sanja Fidler","Dieter Fox","Songwei Ge","Yunhao Ge","Jinwei Gu","Siddharth Gururani","Ethan He","Jiahui Huang","Jacob Huffman","Pooya Jannaty","Jingyi Jin","Seung Wook Kim","Gergely Klár","Grace Lam","Shiyi Lan","Laura Leal-Taixe","Anqi Li","Zhaoshuo Li","Chen-Hsuan Lin","Tsung-Yi Lin","Huan Ling","Ming-Yu Liu","Xian Liu","Alice Luo","Qianli Ma","Hanzi Mao","Kaichun Mo","Arsalan Mousavian","Seungjun Nah","Sriharsha Niverty","David Page","Despoina Paschalidou","Zeeshan Patel","Lindsey Pavao","Morteza Ramezanali","Fitsum Reda","Xiaowei Ren","Vasanth Rao Naik Sabavat","Ed Schmerling","Stella Shi","Bartosz Stefaniak","Shitao Tang","Lyne Tchapmi","Przemek Tredak","Wei-Cheng Tseng","Jibin Varghese","Hao Wang","Haoxiang Wang","Heng Wang","Ting-Chun Wang","Fangyin Wei","Xinyue Wei","Jay Zhangjie Wu","Jiashu Xu","Wei Yang","Lin Yen-Chen","Xiaohui Zeng","Yu Zeng","Jing Zhang","Qinsheng Zhang","Yuxuan Zhang","Qingqing Zhao","Artur Zolkowski"],"pdf_url":"https://arxiv.org/pdf/2501.03575v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03571v1","updated":"2025-01-07T06:51:17Z","published":"2025-01-07T06:51:17Z","title":"AADNet: Exploring EEG Spatiotemporal Information for Fast and Accurate\n Orientation and Timbre Detection of Auditory Attention Based on A Cue-Masked\n Paradigm","summary":" Auditory attention decoding from electroencephalogram (EEG) could infer to\nwhich source the user is attending in noisy environments. 
Decoding algorithms\nand experimental paradigm designs are crucial for the development of technology\nin practical applications. To simulate real-world scenarios, this study\nproposed a cue-masked auditory attention paradigm to avoid information leakage\nbefore the experiment. To obtain high decoding accuracy with low latency, an\nend-to-end deep learning model, AADNet, was proposed to exploit the\nspatiotemporal information from the short time window of EEG signals. The\nresults showed that with a 0.5-second EEG window, AADNet achieved an average\naccuracy of 93.46% and 91.09% in decoding auditory orientation attention (OA)\nand timbre attention (TA), respectively. It significantly outperformed five\nprevious methods and did not need knowledge of the original audio source.\nThis work demonstrated that it was possible to detect the orientation and\ntimbre of auditory attention from EEG signals quickly and accurately. The results\nare promising for real-time multi-property auditory attention decoding,\nfacilitating the application of neuro-steered hearing aids and other\nassistive listening devices.\n","authors":["Keren Shi","Xu Liu","Xue Yuan","Haijie Shang","Ruiting Dai","Hanbin Wang","Yunfa Fu","Ning Jiang","Jiayuan He"],"pdf_url":"https://arxiv.org/pdf/2501.03571v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.12370v2","updated":"2025-01-07T06:47:00Z","published":"2024-12-16T21:56:01Z","title":"Scam Detection for Ethereum Smart Contracts: Leveraging Graph\n Representation Learning for Secure Blockchain","summary":" Due to the increasing abuse of fraudulent activities that result in\nsignificant financial and reputational harm, Ethereum smart contracts face a\nsignificant problem in detecting fraud. Existing monitoring methods typically\nrely on lease code analysis or physically extracted features, which suffer from\nscalability and adaptability limitations. 
In this study, we use graph\nrepresentation learning to observe purchase trends and find fraudulent deals.\nWe can achieve powerful categorisation performance by using innovative machine\nlearning versions and transforming Ethereum invoice data into graph structures.\nOur method addresses label imbalance through SMOTE-ENN techniques and evaluates\nmodels like Multi-Layer Perceptron (MLP) and Graph Convolutional Networks\n(GCN). Experimental results show that the MLP type surpasses the GCN in this\nenvironment, with domain-specific assessments closely aligned with real-world\nassessments. This study provides a scalable and efficient way to improve\nEthereum's ecosystem's confidence and security.\n","authors":["Yihong Jin","Ze Yang"],"pdf_url":"https://arxiv.org/pdf/2412.12370v2.pdf","comment":"Accepted to BDICN 2025"},{"id":"http://arxiv.org/abs/2403.10089v4","updated":"2025-01-07T06:45:58Z","published":"2024-03-15T08:05:16Z","title":"Approximation and bounding techniques for the Fisher-Rao distances\n between parametric statistical models","summary":" The Fisher-Rao distance between two probability distributions of a\nstatistical model is defined as the Riemannian geodesic distance induced by the\nFisher information metric. In order to calculate the Fisher-Rao distance in\nclosed-form, we need (1) to elicit a formula for the Fisher-Rao geodesics, and\n(2) to integrate the Fisher length element along those geodesics. We consider\nseveral numerically robust approximation and bounding techniques for the\nFisher-Rao distances: First, we report generic upper bounds on Fisher-Rao\ndistances based on closed-form 1D Fisher-Rao distances of submodels. Second, we\ndescribe several generic approximation schemes depending on whether the\nFisher-Rao geodesics or pregeodesics are available in closed-form or not. 
In\nparticular, we obtain a generic method to guarantee an arbitrarily small\nadditive error on the approximation provided that Fisher-Rao pregeodesics and\ntight lower and upper bounds are available. Third, we consider the case of\nFisher metrics being Hessian metrics, and report generic tight upper bounds on\nthe Fisher-Rao distances using techniques of information geometry.\nUniparametric and biparametric statistical models always have Fisher Hessian\nmetrics, and in general a simple test allows to check whether the Fisher\ninformation matrix yields a Hessian metric or not. Fourth, we consider\nelliptical distribution families and show how to apply the above techniques to\nthese models. We also propose two new distances based either on the Fisher-Rao\nlengths of curves serving as proxies of Fisher-Rao geodesics, or based on the\nBirkhoff/Hilbert projective cone distance. Last, we consider an alternative\ngroup-theoretic approach for statistical transformation models based on the\nnotion of maximal invariant which yields insights on the structures of the\nFisher-Rao distance formula which may be used fruitfully in applications.\n","authors":["Frank Nielsen"],"pdf_url":"https://arxiv.org/pdf/2403.10089v4.pdf","comment":"48 pages"},{"id":"http://arxiv.org/abs/2501.03568v1","updated":"2025-01-07T06:43:18Z","published":"2025-01-07T06:43:18Z","title":"Advanced Tutorial: Label-Efficient Two-Sample Tests","summary":" Hypothesis testing is a statistical inference approach used to determine\nwhether data supports a specific hypothesis. An important type is the\ntwo-sample test, which evaluates whether two sets of data points are from\nidentical distributions. This test is widely used, such as by clinical\nresearchers comparing treatment effectiveness. This tutorial explores\ntwo-sample testing in a context where an analyst has many features from two\nsamples, but determining the sample membership (or labels) of these features is\ncostly. 
In machine learning, a similar scenario is studied in active learning.\nThis tutorial extends active learning concepts to two-sample testing within\nthis \textit{label-costly} setting while maintaining statistical validity and\nhigh testing power. Additionally, the tutorial discusses practical applications\nof these label-efficient two-sample tests.\n","authors":["Weizhi Li","Visar Berisha","Gautam Dasarathy"],"pdf_url":"https://arxiv.org/pdf/2501.03568v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15320v2","updated":"2025-01-07T06:39:29Z","published":"2024-07-07T09:25:52Z","title":"Edge Graph Intelligence: Reciprocally Empowering Edge Networks with\n Graph Intelligence","summary":" Recent years have witnessed a thriving growth of computing facilities\nconnected at the network edge, cultivating edge networks as a fundamental\ninfrastructure for supporting miscellaneous intelligent services. Meanwhile,\nArtificial Intelligence (AI) frontiers have extrapolated to the graph domain\nand promoted Graph Intelligence (GI). Given the inherent relation between\ngraphs and networks, the interdiscipline of graph learning and edge networks,\ni.e., Edge GI or EGI, has revealed a novel interplay between them -- GI aids in\noptimizing edge networks, while edge networks facilitate GI model deployment.\nDriven by this delicate closed-loop, EGI is recognized as a promising solution\nto fully unleash the potential of edge computing power and is garnering growing\nattention. Nevertheless, research on EGI remains nascent, and there is a\nsoaring demand within both the communications and AI communities for a\ndedicated venue to share recent advancements. 
To this end, this paper promotes\nthe concept of EGI, explores its scope and core principles, and conducts a\ncomprehensive survey concerning recent research efforts on this emerging field.\nSpecifically, this paper introduces and discusses: 1) fundamentals of edge\ncomputing and graph learning, 2) emerging techniques centering on the closed\nloop between graph intelligence and edge networks, and 3) open challenges and\nresearch opportunities of future EGI. By bridging the gap across communication,\nnetworking, and graph learning areas, we believe that this survey can garner\nincreased attention, foster meaningful discussions, and inspire further\nresearch ideas in EGI.\n","authors":["Liekang Zeng","Shengyuan Ye","Xu Chen","Xiaoxi Zhang","Ju Ren","Jian Tang","Yang Yang","Xuemin Shen"],"pdf_url":"https://arxiv.org/pdf/2407.15320v2.pdf","comment":"Accepted by IEEE Communications Surveys & Tutorials"},{"id":"http://arxiv.org/abs/2412.02155v2","updated":"2025-01-07T06:30:24Z","published":"2024-12-03T04:29:27Z","title":"CausalMob: Causal Human Mobility Prediction with LLMs-derived Human\n Intentions toward Public Events","summary":" Large-scale human mobility exhibits spatial and temporal patterns that can\nassist policymakers in decision making. Although traditional prediction models\nattempt to capture these patterns, they are often interfered with by non-periodic public\nevents, such as disasters and occasional celebrations. Since regular human\nmobility patterns are heavily affected by these events, estimating their causal\neffects is critical to accurate mobility predictions. Although news articles\nprovide unique perspectives on these events in an unstructured format,\nprocessing is a challenge. In this study, we propose a causality-augmented\nprediction model, called CausalMob, to analyze the causal effects of public\nevents. 
We first utilize large language models (LLMs) to extract human\nintentions from news articles and transform them into features that act as\ncausal treatments. Next, the model learns representations of spatio-temporal\nregional covariates from multiple data sources to serve as confounders for\ncausal inference. Finally, we present a causal effect estimation framework to\nensure event features remain independent of confounders during prediction.\nBased on large-scale real-world data, the experimental results show that the\nproposed model excels in human mobility prediction, outperforming\nstate-of-the-art models.\n","authors":["Xiaojie Yang","Hangli Ge","Jiawei Wang","Zipei Fan","Renhe Jiang","Ryosuke Shibasaki","Noboru Koshizuka"],"pdf_url":"https://arxiv.org/pdf/2412.02155v2.pdf","comment":"Accepted by KDD 2025"},{"id":"http://arxiv.org/abs/2310.10207v6","updated":"2025-01-07T06:28:56Z","published":"2023-10-16T09:19:18Z","title":"Bongard-OpenWorld: Few-Shot Reasoning for Free-form Visual Concepts in\n the Real World","summary":" We introduce Bongard-OpenWorld, a new benchmark for evaluating real-world\nfew-shot reasoning for machine vision. It originates from the classical Bongard\nProblems (BPs): Given two sets of images (positive and negative), the model\nneeds to identify the set that query images belong to by inducing the visual\nconcepts, which are exclusively depicted by images from the positive set. Our\nbenchmark inherits the few-shot concept induction of the original BPs while\nadding two novel layers of challenge: 1) open-world free-form concepts, as\nthe visual concepts in Bongard-OpenWorld are unique compositions of terms from\nan open vocabulary, ranging from object categories to abstract visual\nattributes and commonsense factual knowledge; 2) real-world images, as opposed\nto the synthetic diagrams used by many counterparts. In our exploration,\nBongard-OpenWorld already poses a significant challenge to current few-shot\nreasoning algorithms. 
We further investigate to what extent the recently\nintroduced Large Language Models (LLMs) and Vision-Language Models (VLMs) can\nsolve our task, by directly probing VLMs, and combining VLMs and LLMs in an\ninteractive reasoning scheme. We even conceived a neuro-symbolic reasoning\napproach that reconciles LLMs & VLMs with logical reasoning to emulate the\nhuman problem-solving process for Bongard Problems. However, none of these\napproaches manage to close the human-machine gap, as the best learner achieves\n64% accuracy while human participants easily reach 91%. We hope\nBongard-OpenWorld can help us better understand the limitations of current\nvisual intelligence and facilitate future research on visual agents with\nstronger few-shot visual reasoning capabilities.\n","authors":["Rujie Wu","Xiaojian Ma","Zhenliang Zhang","Wei Wang","Qing Li","Song-Chun Zhu","Yizhou Wang"],"pdf_url":"https://arxiv.org/pdf/2310.10207v6.pdf","comment":"Accepted to ICLR 2024"},{"id":"http://arxiv.org/abs/2501.03562v1","updated":"2025-01-07T06:22:55Z","published":"2025-01-07T06:22:55Z","title":"Rethinking Adversarial Attacks in Reinforcement Learning from Policy\n Distribution Perspective","summary":" Deep Reinforcement Learning (DRL) suffers from uncertainties and inaccuracies\nin the observation signal in real-world applications. Adversarial attack is an\neffective method for evaluating the robustness of DRL agents. However, existing\nattack methods targeting individual sampled actions have limited impact on the\noverall policy distribution, particularly in continuous action spaces. To\naddress these limitations, we propose the Distribution-Aware Projected Gradient\nDescent attack (DAPGD). DAPGD uses distribution similarity as the gradient\nperturbation input to attack the policy network, which leverages the entire\npolicy distribution rather than relying on individual samples. 
We utilize the\nBhattacharyya distance in DAPGD to measure policy similarity, enabling\nsensitive detection of subtle but critical differences between probability\ndistributions. Our experiment results demonstrate that DAPGD achieves SOTA\nresults compared to the baselines in three robot navigation tasks, achieving an\naverage 22.03% higher reward drop compared to the best baseline.\n","authors":["Tianyang Duan","Zongyuan Zhang","Zheng Lin","Yue Gao","Ling Xiong","Yong Cui","Hongbin Liang","Xianhao Chen","Heming Cui","Dong Huang"],"pdf_url":"https://arxiv.org/pdf/2501.03562v1.pdf","comment":"10 pages, 2 figures, 2 tables"},{"id":"http://arxiv.org/abs/2501.03560v1","updated":"2025-01-07T06:21:40Z","published":"2025-01-07T06:21:40Z","title":"KG-TRICK: Unifying Textual and Relational Information Completion of\n Knowledge for Multilingual Knowledge Graphs","summary":" Multilingual knowledge graphs (KGs) provide high-quality relational and\ntextual information for various NLP applications, but they are often\nincomplete, especially in non-English languages. Previous research has shown\nthat combining information from KGs in different languages aids either\nKnowledge Graph Completion (KGC), the task of predicting missing relations\nbetween entities, or Knowledge Graph Enhancement (KGE), the task of predicting\nmissing textual information for entities. Although previous efforts have\nconsidered KGC and KGE as independent tasks, we hypothesize that they are\ninterdependent and mutually beneficial. To this end, we introduce KG-TRICK, a\nnovel sequence-to-sequence framework that unifies the tasks of textual and\nrelational information completion for multilingual KGs. KG-TRICK demonstrates\nthat: i) it is possible to unify the tasks of KGC and KGE into a single\nframework, and ii) combining textual information from multiple languages is\nbeneficial to improve the completeness of a KG. 
As part of our contributions,\nwe also introduce WikiKGE10++, the largest manually-curated benchmark for\ntextual information completion of KGs, which features over 25,000 entities\nacross 10 diverse languages.\n","authors":["Zelin Zhou","Simone Conia","Daniel Lee","Min Li","Shenglei Huang","Umar Farooq Minhas","Saloni Potdar","Henry Xiao","Yunyao Li"],"pdf_url":"https://arxiv.org/pdf/2501.03560v1.pdf","comment":"Camera ready for COLING 2025"},{"id":"http://arxiv.org/abs/2408.09791v2","updated":"2025-01-07T06:11:20Z","published":"2024-08-19T08:40:53Z","title":"ALTBI: Constructing Improved Outlier Detection Models via Optimization\n of Inlier-Memorization Effect","summary":" Outlier detection (OD) is the task of identifying unusual observations (or\noutliers) from a given or upcoming data by learning unique patterns of normal\nobservations (or inliers). Recently, a study introduced a powerful unsupervised\nOD (UOD) solver based on a new observation of deep generative models, called\ninlier-memorization (IM) effect, which suggests that generative models memorize\ninliers before outliers in early learning stages. In this study, we aim to\ndevelop a theoretically principled method to address UOD tasks by maximally\nutilizing the IM effect. We begin by observing that the IM effect is observed\nmore clearly when the given training data contain fewer outliers. This finding\nindicates a potential for enhancing the IM effect in UOD regimes if we can\neffectively exclude outliers from mini-batches when designing the loss\nfunction. To this end, we introduce two main techniques: 1) increasing the\nmini-batch size as the model training proceeds and 2) using an adaptive\nthreshold to calculate the truncated loss function. We theoretically show that\nthese two techniques effectively filter out outliers from the truncated loss\nfunction, allowing us to utilize the IM effect to the fullest. 
Coupled with an\nadditional ensemble strategy, we propose our method and term it Adaptive Loss\nTruncation with Batch Increment (ALTBI). We provide extensive experimental\nresults to demonstrate that ALTBI achieves state-of-the-art performance in\nidentifying outliers compared to other recent methods, even with significantly\nlower computation costs. Additionally, we show that our method yields robust\nperformances when combined with privacy-preserving algorithms.\n","authors":["Seoyoung Cho","Jaesung Hwang","Kwan-Young Bak","Dongha Kim"],"pdf_url":"https://arxiv.org/pdf/2408.09791v2.pdf","comment":"24 pages in total"},{"id":"http://arxiv.org/abs/2405.05409v4","updated":"2025-01-07T06:08:52Z","published":"2024-05-08T20:23:24Z","title":"Initialization is Critical to Whether Transformers Fit Composite\n Functions by Reasoning or Memorizing","summary":" Transformers have shown impressive capabilities across various tasks, but\ntheir performance on compositional problems remains a topic of debate. In this\nwork, we investigate the mechanisms of how transformers behave on unseen\ncompositional tasks. We discover that the parameter initialization scale plays\na critical role in determining whether the model learns inferential\n(reasoning-based) solutions, which capture the underlying compositional\nprimitives, or symmetric (memory-based) solutions, which simply memorize\nmappings without understanding the compositional structure. By analyzing the\ninformation flow and vector representations within the model, we reveal the\ndistinct mechanisms underlying these solution types. We further find that\ninferential (reasoning-based) solutions exhibit low complexity bias, which we\nhypothesize is a key factor enabling them to learn individual mappings for\nsingle anchors. We validate our conclusions on various real-world datasets. 
Our\nfindings provide valuable insights into the role of initialization scale in\ntuning the reasoning and memorizing ability and we propose the initialization\nrate $\\gamma$ to be a convenient tunable hyper-parameter in common deep\nlearning frameworks, where $1/d_{\\mathrm{in}}^\\gamma$ is the standard deviation\nof parameters of the layer with $d_{\\mathrm{in}}$ input neurons.\n","authors":["Zhongwang Zhang","Pengxiao Lin","Zhiwei Wang","Yaoyu Zhang","Zhi-Qin John Xu"],"pdf_url":"https://arxiv.org/pdf/2405.05409v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.02156v2","updated":"2025-01-07T05:36:22Z","published":"2025-01-04T01:45:32Z","title":"The Race to Efficiency: A New Perspective on AI Scaling Laws","summary":" As large-scale AI models expand, training becomes costlier and sustaining\nprogress grows harder. Classical scaling laws (e.g., Kaplan et al. (2020),\nHoffmann et al. (2022)) predict training loss from a static compute budget yet\nneglect time and efficiency, prompting the question: how can we balance\nballooning GPU fleets with rapidly improving hardware and algorithms? We\nintroduce the relative-loss equation, a time- and efficiency-aware framework\nthat extends classical AI scaling laws. Our model shows that, without ongoing\nefficiency gains, advanced performance could demand millennia of training or\nunrealistically large GPU fleets. However, near-exponential progress remains\nachievable if the \"efficiency-doubling rate\" parallels Moore's Law. By\nformalizing this race to efficiency, we offer a quantitative roadmap for\nbalancing front-loaded GPU investments with incremental improvements across the\nAI stack. Empirical trends suggest that sustained efficiency gains can push AI\nscaling well into the coming decade, providing a new perspective on the\ndiminishing returns inherent in classical scaling.\n","authors":["Chien-Ping Lu"],"pdf_url":"https://arxiv.org/pdf/2501.02156v2.pdf","comment":"21 pages, 3 figures. 
2 tables, second draft"},{"id":"http://arxiv.org/abs/2402.13516v7","updated":"2025-01-07T05:26:54Z","published":"2024-02-21T03:58:49Z","title":"ProSparse: Introducing and Enhancing Intrinsic Activation Sparsity\n within Large Language Models","summary":" Activation sparsity refers to the existence of considerable\nweakly-contributed elements among activation outputs. As a prevalent property\nof the models using the ReLU activation function, activation sparsity has been\nproven a promising paradigm to boost model inference efficiency. Nevertheless,\nmost large language models (LLMs) adopt activation functions without intrinsic\nactivation sparsity (e.g., GELU and Swish). Some recent efforts have explored\nintroducing ReLU or its variants as the substitutive activation function to\nhelp LLMs achieve activation sparsity and inference acceleration, but few can\nsimultaneously obtain high sparsity and comparable model performance. This\npaper introduces a simple and effective sparsification method named \"ProSparse\"\nto push LLMs for higher activation sparsity while maintaining comparable\nperformance. Specifically, after substituting the activation function of LLMs\nwith ReLU, ProSparse adopts progressive sparsity regularization with a factor\nsmoothly increasing along the multi-stage sine curves. This can enhance\nactivation sparsity and mitigate performance degradation by avoiding radical\nshifts in activation distributions. With ProSparse, we obtain high sparsity of\n89.32% for LLaMA2-7B, 88.80% for LLaMA2-13B, and 87.89% for end-size\nMiniCPM-1B, respectively, achieving comparable performance to their original\nSwish-activated versions. These present the most sparsely activated models\namong open-source LLaMA versions and competitive end-size models, considerably\nsurpassing ReluLLaMA-7B (66.98%) and ReluLLaMA-13B (71.56%). 
Our inference\nacceleration experiments further demonstrate the significant practical\nacceleration potential of LLMs with higher activation sparsity, obtaining up to\n4.52$\\times$ inference speedup.\n","authors":["Chenyang Song","Xu Han","Zhengyan Zhang","Shengding Hu","Xiyu Shi","Kuai Li","Chen Chen","Zhiyuan Liu","Guangli Li","Tao Yang","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2402.13516v7.pdf","comment":"19 pages, 4 figures, 9 tables"},{"id":"http://arxiv.org/abs/2501.03540v1","updated":"2025-01-07T05:23:36Z","published":"2025-01-07T05:23:36Z","title":"Deep Learning within Tabular Data: Foundations, Challenges, Advances and\n Future Directions","summary":" Tabular data remains one of the most prevalent data types across a wide range\nof real-world applications, yet effective representation learning for this\ndomain poses unique challenges due to its irregular patterns, heterogeneous\nfeature distributions, and complex inter-column dependencies. This survey\nprovides a comprehensive review of state-of-the-art techniques in tabular data\nrepresentation learning, structured around three foundational design elements:\ntraining data, neural architectures, and learning objectives. Unlike prior\nsurveys that focus primarily on either architecture design or learning\nstrategies, we adopt a holistic perspective that emphasizes the universality\nand robustness of representation learning methods across diverse downstream\ntasks. We examine recent advances in data augmentation and generation,\nspecialized neural network architectures tailored to tabular data, and\ninnovative learning objectives that enhance representation quality.\nAdditionally, we highlight the growing influence of self-supervised learning\nand the adaptation of transformer-based foundation models for tabular data. Our\nreview is based on a systematic literature search using rigorous inclusion\ncriteria, encompassing 127 papers published since 2020 in top-tier conferences\nand journals. 
Through detailed analysis and comparison, we identify emerging\ntrends, critical gaps, and promising directions for future research, aiming to\nguide the development of more generalizable and effective tabular data\nrepresentation methods.\n","authors":["Weijieying Ren","Tianxiang Zhao","Yuqing Huang","Vasant Honavar"],"pdf_url":"https://arxiv.org/pdf/2501.03540v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03526v1","updated":"2025-01-07T04:42:45Z","published":"2025-01-07T04:42:45Z","title":"FgC2F-UDiff: Frequency-guided and Coarse-to-fine Unified Diffusion Model\n for Multi-modality Missing MRI Synthesis","summary":" Multi-modality magnetic resonance imaging (MRI) is essential for the\ndiagnosis and treatment of brain tumors. However, missing modalities are\ncommonly observed due to limitations in scan time, scan corruption, artifacts,\nmotion, and contrast agent intolerance. Synthesis of missing MRI has been a\nmeans to address the limitations of modality insufficiency in clinical practice\nand research. However, there are still some challenges, such as poor\ngeneralization, inaccurate non-linear mapping, and slow processing speeds. To\naddress the aforementioned issues, we propose a novel unified synthesis model,\nthe Frequency-guided and Coarse-to-fine Unified Diffusion Model (FgC2F-UDiff),\ndesigned for multiple inputs and outputs. Specifically, the Coarse-to-fine\nUnified Network (CUN) fully exploits the iterative denoising properties of\ndiffusion models, from global to detail, by dividing the denoising process into\ntwo stages, coarse and fine, to enhance the fidelity of synthesized images.\nSecondly, the Frequency-guided Collaborative Strategy (FCS) harnesses\nappropriate frequency information as prior knowledge to guide the learning of a\nunified, highly non-linear mapping. 
Thirdly, the Specific-acceleration Hybrid\nMechanism (SHM) integrates specific mechanisms to accelerate the diffusion\nmodel and enhance the feasibility of many-to-many synthesis. Extensive\nexperimental evaluations have demonstrated that our proposed FgC2F-UDiff model\nachieves superior performance on two datasets, validated through a\ncomprehensive assessment that includes both qualitative observations and\nquantitative metrics, such as PSNR, SSIM, LPIPS, and FID.\n","authors":["Xiaojiao Xiao","Qinmin Vivian Hu","Guanghui Wang"],"pdf_url":"https://arxiv.org/pdf/2501.03526v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.13371v2","updated":"2025-01-07T04:42:21Z","published":"2024-02-20T20:53:04Z","title":"FIDLAR: Forecast-Informed Deep Learning Architecture for Flood\n Mitigation","summary":" In coastal river systems, frequent floods, often occurring during major\nstorms or king tides, pose a severe threat to lives and property. However,\nthese floods can be mitigated or even prevented by strategically releasing\nwater before extreme weather events with hydraulic structures such as dams,\ngates, pumps, and reservoirs. A standard approach used by local water\nmanagement agencies is the \"rule-based\" method, which specifies predetermined\npre-releases of water based on historical and time-tested human experience, but\nwhich tends to result in excess or inadequate water release. Model\npredictive control (MPC), a physics-based model for prediction, is an\nalternative approach, albeit involving computationally intensive calculations.\nIn this paper, we propose a Forecast Informed Deep Learning Architecture,\nFIDLAR, to achieve rapid and optimal flood management with precise water\npre-releases. FIDLAR seamlessly integrates two neural network modules: one\ncalled the Flood Manager, which is responsible for generating water pre-release\nschedules, and another called the Flood Evaluator, which assesses these\ngenerated schedules. 
The Evaluator module is pre-trained separately, and its\ngradient-based feedback is used to train the Manager model, ensuring optimal\nwater pre-releases. We have conducted experiments using FIDLAR with data from a\nflood-prone coastal area in South Florida, particularly susceptible to frequent\nstorms. Results show that FIDLAR is several orders of magnitude faster than\ncurrently used physics-based approaches while outperforming baseline methods\nwith improved water pre-release schedules.\n","authors":["Jimeng Shi","Zeda Yin","Arturo Leon","Jayantha Obeysekera","Giri Narasimhan"],"pdf_url":"https://arxiv.org/pdf/2402.13371v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.12935v2","updated":"2025-01-07T04:42:20Z","published":"2024-06-17T03:03:34Z","title":"ChatBug: A Common Vulnerability of Aligned LLMs Induced by Chat\n Templates","summary":" Large language models (LLMs) are expected to follow instructions from users\nand engage in conversations. Techniques to enhance LLMs' instruction-following\ncapabilities typically fine-tune them using data structured according to a\npredefined chat template. Although chat templates are shown to be effective in\noptimizing LLM performance, their impact on safety alignment of LLMs has been\nless understood, which is crucial for deploying LLMs safely at scale.\n In this paper, we investigate how chat templates affect safety alignment of\nLLMs. We identify a common vulnerability, named ChatBug, that is introduced by\nchat templates. Our key insight to identify ChatBug is that the chat templates\nprovide a rigid format that needs to be followed by LLMs, but not by users.\nHence, a malicious user may not necessarily follow the chat template when\nprompting LLMs. Instead, malicious users could leverage their knowledge of the\nchat template and accordingly craft their prompts to bypass safety alignments\nof LLMs. We develop two attacks to exploit the ChatBug vulnerability. 
We\ndemonstrate that a malicious user can exploit the ChatBug vulnerability of\neight state-of-the-art (SOTA) LLMs and effectively elicit unintended responses\nfrom these models. Moreover, we show that ChatBug can be exploited by existing\njailbreak attacks to enhance their attack success rates. We investigate\npotential countermeasures to ChatBug. Our results show that while adversarial\ntraining effectively mitigates the ChatBug vulnerability, the victim model\nincurs significant performance degradation. These results highlight the\ntrade-off between safety alignment and helpfulness. Developing new methods for\ninstruction tuning to balance this trade-off is an open and critical direction\nfor future research.\n","authors":["Fengqing Jiang","Zhangchen Xu","Luyao Niu","Bill Yuchen Lin","Radha Poovendran"],"pdf_url":"https://arxiv.org/pdf/2406.12935v2.pdf","comment":"This paper is accepted to AAAI 2025"},{"id":"http://arxiv.org/abs/2310.15624v2","updated":"2025-01-07T04:39:25Z","published":"2023-10-24T08:45:15Z","title":"GUPNet++: Geometry Uncertainty Propagation Network for Monocular 3D\n Object Detection","summary":" Geometry plays a significant role in monocular 3D object detection. It can be\nused to estimate object depth by using the perspective projection between an\nobject's physical size and its 2D projection in the image plane, which can\nintroduce mathematical priors into deep models. However, this projection\nprocess also introduces error amplification, where the error of the estimated\nheight is amplified and reflected into the projected depth. It leads to\nunreliable depth inferences and also impairs training stability. To tackle this\nproblem, we propose a novel Geometry Uncertainty Propagation Network (GUPNet++)\nby modeling geometry projection in a probabilistic manner. This ensures depth\npredictions are well-bounded and associated with a reasonable uncertainty. The\nsignificance of introducing such geometric uncertainty is two-fold: (1) 
It\nmodels the uncertainty propagation relationship of the geometry projection\nduring training, improving the stability and efficiency of the end-to-end model\nlearning. (2) It can be converted into a highly reliable confidence measure to indicate\nthe quality of the 3D detection result, enabling more reliable detection\ninference. Experiments show that the proposed approach not only obtains\nstate-of-the-art (SOTA) performance in image-based monocular 3D detection but\nalso demonstrates superiority in efficacy with a simplified framework.\n","authors":["Yan Lu","Xinzhu Ma","Lei Yang","Tianzhu Zhang","Yating Liu","Qi Chu","Tong He","Yonghui Li","Wanli Ouyang"],"pdf_url":"https://arxiv.org/pdf/2310.15624v2.pdf","comment":"18 pages, 9 figures"},{"id":"http://arxiv.org/abs/2501.03523v1","updated":"2025-01-07T04:38:28Z","published":"2025-01-07T04:38:28Z","title":"Vocal Tract Length Warped Features for Spoken Keyword Spotting","summary":" In this paper, we propose several methods that incorporate vocal tract length\n(VTL) warped features for spoken keyword spotting (KWS). The first method,\nVTL-independent KWS, involves training a single deep neural network (DNN) that\nutilizes VTL features with various warping factors. During training, a specific\nVTL feature is randomly selected per epoch, allowing the exploration of VTL\nvariations. During testing, the VTL features with different warping factors of\na test utterance are scored against the DNN and combined with equal weight. The\nsecond method scores the conventional features of a test utterance (without\nVTL warping) against the DNN. The third method, VTL-concatenation KWS,\nconcatenates VTL warped features to form high-dimensional features for KWS.\nEvaluations carried out on the English Google Command dataset demonstrate that\nthe proposed methods improve the accuracy of KWS.\n","authors":["Achintya kr. 
Sarkar","Priyanka Dwivedi","Zheng-Hua Tan"],"pdf_url":"https://arxiv.org/pdf/2501.03523v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03518v1","updated":"2025-01-07T04:21:13Z","published":"2025-01-07T04:21:13Z","title":"Transfer Learning for Deep-Unfolded Combinatorial Optimization Solver\n with Quantum Annealer","summary":" Quantum annealing (QA) has attracted research interest as a sampler and\ncombinatorial optimization problem (COP) solver. A recently proposed\nsampling-based solver for QA significantly reduces the required number of\nqubits, making it capable of handling large COPs. In relation to this, a trainable\nsampling-based COP solver has been proposed that optimizes its internal\nparameters from a dataset by using a deep learning technique called deep\nunfolding. Although learning the internal parameters accelerates the\nconvergence speed, the sampler in the trainable solver is restricted to using a\nclassical sampler owing to the training cost. In this study, to utilize QA in\nthe trainable solver, we propose classical-quantum transfer learning, where\nparameters are trained classically, and the trained parameters are used in the\nsolver with QA. The results of numerical experiments demonstrate that the\ntrainable quantum COP solver using classical-quantum transfer learning improves\nconvergence speed and execution time over the original solver.\n","authors":["Ryo Hagiwara","Shunta Arai","Satoshi Takabe"],"pdf_url":"https://arxiv.org/pdf/2501.03518v1.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2501.03228v2","updated":"2025-01-07T04:05:53Z","published":"2025-01-06T18:59:55Z","title":"LightGNN: Simple Graph Neural Network for Recommendation","summary":" Graph neural networks (GNNs) have demonstrated superior performance in\ncollaborative recommendation through their ability to conduct high-order\nrepresentation smoothing, effectively capturing structural information within\nusers' interaction patterns. 
However, existing GNN paradigms face significant\nchallenges in scalability and robustness when handling large-scale, noisy, and\nreal-world datasets. To address these challenges, we present LightGNN, a\nlightweight and distillation-based GNN pruning framework designed to\nsubstantially reduce model complexity while preserving essential collaboration\nmodeling capabilities. Our LightGNN framework introduces a computationally\nefficient pruning module that adaptively identifies and removes redundant edges\nand embedding entries for model compression. The framework is guided by a\nresource-friendly hierarchical knowledge distillation objective, whose\nintermediate layer augments the observed graph to maintain performance,\nparticularly in high-rate compression scenarios. Extensive experiments on\npublic datasets demonstrate LightGNN's effectiveness, significantly improving\nboth computational efficiency and recommendation accuracy. Notably, LightGNN\nachieves an 80% reduction in edge count and 90% reduction in embedding entries\nwhile maintaining performance comparable to more complex state-of-the-art\nbaselines. The implementation of our LightGNN framework is available at the\ngithub repository: https://github.com/HKUDS/LightGNN.\n","authors":["Guoxuan Chen","Lianghao Xia","Chao Huang"],"pdf_url":"https://arxiv.org/pdf/2501.03228v2.pdf","comment":"Accepted to WSDM 2025 Oral"},{"id":"http://arxiv.org/abs/2410.23111v5","updated":"2025-01-07T03:56:49Z","published":"2024-10-30T15:23:44Z","title":"Exploring Gradient Subspaces: Addressing and Overcoming LoRA's\n Limitations in Federated Fine-Tuning of Large Language Models","summary":" Large Language Models (LLMs) have demonstrated remarkable capabilities across\nvarious domains, particularly in task generalization for both text and vision\ndata. 
While fine-tuning these models can significantly enhance their\nperformance on specific downstream tasks, it often requires high-quality data\nthat cannot be shared due to privacy concerns. Federated Learning (FL) offers a\npromising solution for collaborative training without direct data sharing.\nHowever, many parameter-efficient fine-tuning strategies for LLMs in FL,\nparticularly those based on Low-Rank Adaptation (LoRA), face limitations. In\nthis paper, we critically analyze the convergence and performance guarantees of\npopular FL frameworks utilizing LoRA, highlighting its suboptimal nature due to\nconstrained subspace learning of low-rank matrices. This limitation hinders\neffective fine-tuning of LLMs in federated settings. Through rigorous\nanalytical and empirical evaluations, we demonstrate that direct weight\naveraging outperforms LoRA-based strategies, leading to superior performance\nfor fine-tuned models. Our comprehensive comparison unmasks inefficiencies in\nLoRA approaches and underscores the advantages of direct weight aggregation. We\nextend our analysis to low-rank gradient-based optimizers, such as GaLore, used\nduring local training steps. Our findings show that GaLore along with\ndirect-weight aggregation is a more effective approach, outperforming federated\nLoRA methods like FlexLoRA and FFA-LoRA across both text and image modalities.\nWhile privacy remains paramount in FL discourse, our focus is on assessing\nperformance outcomes of federated fine-tuned models and evaluating various FL\nframeworks from both theoretical and empirical perspectives. 
Our findings\nadvocate reassessing the reliance on LoRA within FL contexts, paving the way\nfor more efficient training methodologies.\n","authors":["Navyansh Mahla","Kshitij Sharad Jadhav","Ganesh Ramakrishnan"],"pdf_url":"https://arxiv.org/pdf/2410.23111v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16766v2","updated":"2025-01-07T03:53:12Z","published":"2024-05-27T02:27:28Z","title":"Concept Matching with Agent for Out-of-Distribution Detection","summary":" The remarkable achievements of Large Language Models (LLMs) have captivated\nthe attention of both academia and industry, transcending their initial role in\ndialogue generation. To expand the usage scenarios of LLM, some works enhance\nthe effectiveness and capabilities of the model by introducing more external\ninformation, which is called the agent paradigm. Based on this idea, we propose\na new method that integrates the agent paradigm into out-of-distribution (OOD)\ndetection task, aiming to improve its robustness and adaptability. Our proposed\nmethod, Concept Matching with Agent (CMA), employs neutral prompts as agents to\naugment the CLIP-based OOD detection process. These agents function as dynamic\nobservers and communication hubs, interacting with both In-distribution (ID)\nlabels and data inputs to form vector triangle relationships. This triangular\nframework offers a more nuanced approach than the traditional binary\nrelationship, allowing for better separation and identification of ID and OOD\ninputs. 
Our extensive experimental results showcase the superior performance of\nCMA over both zero-shot and training-required methods in a diverse array of\nreal-world scenarios.\n","authors":["Yuxiao Lee","Xiaofeng Cao","Jingcai Guo","Wei Ye","Qing Guo","Yi Chang"],"pdf_url":"https://arxiv.org/pdf/2405.16766v2.pdf","comment":"Accepted by AAAI-25"},{"id":"http://arxiv.org/abs/2501.03507v1","updated":"2025-01-07T03:50:11Z","published":"2025-01-07T03:50:11Z","title":"An Empirical Study of Accuracy-Robustness Tradeoff and Training\n Efficiency in Self-Supervised Learning","summary":" Self-supervised learning (SSL) has significantly advanced image\nrepresentation learning, yet efficiency challenges persist, particularly with\nadversarial training. Many SSL methods require extensive epochs to achieve\nconvergence, a demand further amplified in adversarial settings. To address\nthis inefficiency, we revisit the robust EMP-SSL framework, emphasizing the\nimportance of increasing the number of crops per image to accelerate learning.\nUnlike traditional contrastive learning, robust EMP-SSL leverages multi-crop\nsampling, integrates an invariance term and regularization, and reduces\ntraining epochs, enhancing time efficiency. Evaluated with both standard linear\nclassifiers and multi-patch embedding aggregation, robust EMP-SSL provides new\ninsights into SSL evaluation strategies.\n Our results show that robust crop-based EMP-SSL not only accelerates\nconvergence but also achieves a superior balance between clean accuracy and\nadversarial robustness, outperforming multi-crop embedding aggregation.\nAdditionally, we extend this approach with free adversarial training in\nMulti-Crop SSL, introducing the Cost-Free Adversarial Multi-Crop\nSelf-Supervised Learning (CF-AMC-SSL) method. CF-AMC-SSL demonstrates the\neffectiveness of free adversarial training in reducing training time while\nsimultaneously improving clean accuracy and adversarial robustness. 
These\nfindings underscore the potential of CF-AMC-SSL for practical SSL applications.\nOur code is publicly available at https://github.com/softsys4ai/CF-AMC-SSL.\n","authors":["Fatemeh Ghofrani","Pooyan Jamshidi"],"pdf_url":"https://arxiv.org/pdf/2501.03507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13656v2","updated":"2025-01-07T03:37:12Z","published":"2024-08-24T19:14:02Z","title":"Localize-and-Stitch: Efficient Model Merging via Sparse Task Arithmetic","summary":" Model merging offers an effective strategy to combine the strengths of\nmultiple finetuned models into a unified model that preserves the specialized\ncapabilities of each. Existing methods merge models in a global manner,\nperforming arithmetic operations across all model parameters. However, such\nglobal merging often leads to task interference, degrading the performance of\nthe merged model. In this work, we introduce Localize-and-Stitch, a novel\napproach that merges models in a localized way. Our algorithm works in two\nsteps: i) Localization: identify tiny ($1\\%$ of the total parameters) localized\nregions in the finetuned models containing essential skills for the downstream\ntasks, and ii) Stitching: reintegrate only these essential regions back into\nthe pretrained model for task synergy. We demonstrate that our approach\neffectively locates sparse regions responsible for finetuned performance, and\nthe localized regions could be treated as compact and interpretable\nrepresentations of the finetuned models (tasks). Empirically, we evaluate our\nmethod on various vision and language benchmarks, showing that it outperforms\nexisting model merging methods under different data availability scenarios.\nBeyond strong empirical performance, our algorithm also facilitates model\ncompression and preserves pretrained knowledge, enabling flexible and continual\nskill composition from multiple finetuned models with minimal storage and\ncomputational overhead. 
Our code is available at\nhttps://github.com/uiuctml/Localize-and-Stitch.\n","authors":["Yifei He","Yuzheng Hu","Yong Lin","Tong Zhang","Han Zhao"],"pdf_url":"https://arxiv.org/pdf/2408.13656v2.pdf","comment":"TMLR camera-ready version"},{"id":"http://arxiv.org/abs/2501.03495v1","updated":"2025-01-07T03:33:22Z","published":"2025-01-07T03:33:22Z","title":"Textualize Visual Prompt for Image Editing via Diffusion Bridge","summary":" Visual prompt, a pair of before-and-after edited images, can convey\nindescribable imagery transformations and prosper in image editing. However,\ncurrent visual prompt methods rely on a pretrained text-guided image-to-image\ngenerative model that requires a triplet of text, before, and after images for\nretraining over a text-to-image model. Such crafting triplets and retraining\nprocesses limit the scalability and generalization of editing. In this paper,\nwe present a framework based on any single text-to-image model without reliance\non the explicit image-to-image model thus enhancing the generalizability and\nscalability. Specifically, by leveraging the probability-flow ordinary\nequation, we construct a diffusion bridge to transfer the distribution between\nbefore-and-after images under the text guidance. By optimizing the text via the\nbridge, the framework adaptively textualizes the editing transformation\nconveyed by visual prompts into text embeddings without other models.\nMeanwhile, we introduce differential attention control during text\noptimization, which disentangles the text embedding from the invariance of the\nbefore-and-after images and makes it solely capture the delicate transformation\nand generalize to edit various images. 
Experiments on real images validate\ncompetitive results on the generalization, contextual coherence, and high\nfidelity for delicate editing with just one image pair as the visual prompt.\n","authors":["Pengcheng Xu","Qingnan Fan","Fei Kou","Shuai Qin","Hong Gu","Ruoyu Zhao","Charles Ling","Boyu Wang"],"pdf_url":"https://arxiv.org/pdf/2501.03495v1.pdf","comment":"AAAI 2025"},{"id":"http://arxiv.org/abs/2501.02024v2","updated":"2025-01-07T03:29:43Z","published":"2025-01-02T20:47:04Z","title":"Model Checking in Medical Imaging for Tumor Detection and Segmentation","summary":" Recent advancements in model checking have demonstrated significant potential\nacross diverse applications, particularly in signal and image analysis. Medical\nimaging stands out as a critical domain where model checking can be effectively\napplied to design and evaluate robust frameworks. These frameworks facilitate\nautomatic and semi-automatic delineation of regions of interest within images,\naiding in accurate segmentation. This paper provides a comprehensive analysis\nof recent works leveraging spatial logic to develop operators and tools for\nidentifying regions of interest, including tumorous and non-tumorous areas.\nAdditionally, we examine the challenges inherent to spatial model-checking\ntechniques, such as variability in ground truth data and the need for\nstreamlined procedures suitable for routine clinical practice.\n","authors":["Elhoucine Elfatimi","Lahcen El fatimi"],"pdf_url":"https://arxiv.org/pdf/2501.02024v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03492v1","updated":"2025-01-07T03:23:28Z","published":"2025-01-07T03:23:28Z","title":"Multi-Source Urban Traffic Flow Forecasting with Drone and Loop Detector\n Data","summary":" Traffic forecasting is a fundamental task in transportation research, however\nthe scope of current research has mainly focused on a single data modality of\nloop detectors. 
Recently, the advances in Artificial Intelligence and drone\ntechnologies have made possible novel solutions for efficient, accurate and\nflexible aerial observations of urban traffic. As a promising traffic\nmonitoring approach, drone-captured data can create an accurate multi-sensor\nmobility observatory for large-scale urban networks, when combined with\nexisting infrastructure. Therefore, this paper investigates the problem of\nmulti-source traffic speed prediction, simultaneously using drone and loop\ndetector data. A simple yet effective graph-based model HiMSNet is proposed to\nintegrate multiple data modalities and learn spatio-temporal correlations.\nDetailed analysis shows that predicting accurate segment-level speed is more\nchallenging than the regional speed, especially under high-demand scenarios\nwith heavier congestions and varying traffic dynamics. Utilizing both drone and\nloop detector data, the prediction accuracy can be improved compared to\nsingle-modality cases, when the sensors have lower coverages and are subject to\nnoise. Our simulation study based on vehicle trajectories in a real urban road\nnetwork has highlighted the added value of integrating drones in traffic\nforecasting and monitoring.\n","authors":["Weijiang Xiong","Robert Fonod","Alexandre Alahi","Nikolas Geroliminis"],"pdf_url":"https://arxiv.org/pdf/2501.03492v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11397v2","updated":"2025-01-07T03:17:48Z","published":"2024-06-17T10:33:00Z","title":"DistPred: A Distribution-Free Probabilistic Inference Method for\n Regression and Forecasting","summary":" Traditional regression and prediction tasks often only provide deterministic\npoint estimates. To estimate the distribution or uncertainty of the response\nvariable, traditional methods either assume that the posterior distribution of\nsamples follows a Gaussian process or require thousands of forward passes for\nsample generation. 
We propose a novel approach called DistPred for regression\nand forecasting tasks, which overcomes the limitations of existing methods\nwhile remaining simple and powerful. Specifically, we transform proper scoring\nrules that measure the discrepancy between the predicted distribution and the\ntarget distribution into a differentiable discrete form and use it as a loss\nfunction to train the model end-to-end. This allows the model to draw\nnumerous samples in a single forward pass to estimate the potential\ndistribution of the response variable. We have compared our method with several\nexisting approaches on multiple datasets and achieved state-of-the-art\nperformance. Additionally, our method significantly improves computational\nefficiency. For example, compared to state-of-the-art models, DistPred has a\n180x faster inference speed. Experimental results can be reproduced through\nhttps://github.com/Anoise/DistPred.\n","authors":["Daojun Liang","Haixia Zhang","Dongfeng Yuan"],"pdf_url":"https://arxiv.org/pdf/2406.11397v2.pdf","comment":"Published at KDD 2025"},{"id":"http://arxiv.org/abs/2501.03489v1","updated":"2025-01-07T03:17:47Z","published":"2025-01-07T03:17:47Z","title":"Entropy-Guided Attention for Private LLMs","summary":" The pervasiveness of proprietary language models has raised critical privacy\nconcerns, necessitating advancements in private inference (PI), where\ncomputations are performed directly on encrypted data without revealing users'\nsensitive information. While PI offers a promising solution, its practical\ndeployment is hindered by substantial communication and latency overheads,\nprimarily stemming from nonlinear operations. 
To address this, we introduce an\ninformation-theoretic framework to characterize the role of nonlinearities in\ndecoder-only language models, laying a principled foundation for optimizing\ntransformer architectures tailored to the demands of PI.\n By leveraging Shannon's entropy as a quantitative measure, we uncover the\npreviously unexplored dual significance of nonlinearities: beyond ensuring\ntraining stability, they are crucial for maintaining attention head diversity.\nSpecifically, we find that their removal triggers two critical failure modes:\n{\em entropy collapse} in deeper layers that destabilizes training, and {\em\nentropic overload} in earlier layers that leads to under-utilization of\nMulti-Head Attention's (MHA) representational capacity.\n We propose an entropy-guided attention mechanism paired with a novel entropy\nregularization technique to mitigate entropic overload. Additionally, we\nexplore PI-friendly alternatives to layer normalization for preventing entropy\ncollapse and stabilizing the training of LLMs with reduced nonlinearities. Our\nstudy bridges the gap between information theory and architectural design,\nestablishing entropy dynamics as a principled guide for developing efficient PI\narchitectures. The code and implementation are available at\n\href{https://github.com/Nandan91/entropy-guided-attention-llm}{entropy-guided-llm}.\n","authors":["Nandan Kumar Jha","Brandon Reagen"],"pdf_url":"https://arxiv.org/pdf/2501.03489v1.pdf","comment":"The 6th AAAI Workshop on Privacy-Preserving Artificial Intelligence\n (PPAI), 2025. 
arXiv admin note: substantial text overlap with\n arXiv:2410.13060"},{"id":"http://arxiv.org/abs/2412.19391v2","updated":"2025-01-07T03:15:49Z","published":"2024-12-27T00:36:40Z","title":"An In-Depth Analysis of Adversarial Discriminative Domain Adaptation for\n Digit Classification","summary":" Domain adaptation is an active area of research driven by the growing demand\nfor robust machine learning models that perform well on real-world data.\nAdversarial learning for deep neural networks (DNNs) has emerged as a promising\napproach to improving generalization ability, particularly for image\nclassification. In this paper, we implement a specific adversarial learning\ntechnique known as Adversarial Discriminative Domain Adaptation (ADDA) and\nreplicate digit classification experiments from the original ADDA paper. We\nextend their findings by examining a broader range of domain shifts and provide\na detailed analysis of in-domain classification accuracy post-ADDA. Our results\ndemonstrate that ADDA significantly improves accuracy across certain domain\nshifts with minimal impact on in-domain performance. Furthermore, we provide\nqualitative analysis and propose potential explanations for ADDA's limitations\nin less successful domain shifts. Code is at\nhttps://github.com/eugenechoi2004/COS429_FINAL .\n","authors":["Eugene Choi","Julian Rodriguez","Edmund Young"],"pdf_url":"https://arxiv.org/pdf/2412.19391v2.pdf","comment":"Replacement: Updated methodology section to include grayscale\n preprocessing of SVHN data"},{"id":"http://arxiv.org/abs/2501.03486v1","updated":"2025-01-07T03:14:39Z","published":"2025-01-07T03:14:39Z","title":"Align-Pro: A Principled Approach to Prompt Optimization for LLM\n Alignment","summary":" The alignment of large language models (LLMs) with human values is critical\nas these models become increasingly integrated into various societal and\ndecision-making processes. 
Traditional methods, such as reinforcement learning\nfrom human feedback (RLHF), achieve alignment by fine-tuning model parameters,\nbut these approaches are often computationally expensive and impractical when\nmodels are frozen or inaccessible for parameter modification. In contrast,\nprompt optimization is a viable alternative to RLHF for LLM alignment. While\nthe existing literature has shown empirical promise of prompt optimization, its\ntheoretical underpinning remains under-explored. We address this gap by\nformulating prompt optimization as an optimization problem and try to provide\ntheoretical insights into the optimality of such a framework. To analyze the\nperformance of the prompt optimization, we study theoretical suboptimality\nbounds and provide insights in terms of how prompt optimization depends upon\nthe given prompter and target model. We also provide empirical validation\nthrough experiments on various datasets, demonstrating that prompt optimization\ncan effectively align LLMs, even when parameter fine-tuning is not feasible.\n","authors":["Prashant Trivedi","Souradip Chakraborty","Avinash Reddy","Vaneet Aggarwal","Amrit Singh Bedi","George K. Atia"],"pdf_url":"https://arxiv.org/pdf/2501.03486v1.pdf","comment":"27 pages, Accepted in AAAI 2025"},{"id":"http://arxiv.org/abs/2412.13516v2","updated":"2025-01-07T03:08:39Z","published":"2024-12-18T05:33:16Z","title":"Learning Causal Transition Matrix for Instance-dependent Label Noise","summary":" Noisy labels are both inevitable and problematic in machine learning methods,\nas they negatively impact models' generalization ability by causing\noverfitting. In the context of learning with noise, the transition matrix plays\na crucial role in the design of statistically consistent algorithms. However,\nthe transition matrix is often considered unidentifiable. 
One strand of methods\ntypically addresses this problem by assuming that the transition matrix is\ninstance-independent; that is, the probability of mislabeling a particular\ninstance is not influenced by its characteristics or attributes. This\nassumption is clearly invalid in complex real-world scenarios. To better\nunderstand the transition relationship and relax this assumption, we propose to\nstudy the data generation process of noisy labels from a causal perspective. We\ndiscover that an unobservable latent variable can affect either the instance\nitself, the label annotation procedure, or both, which complicates the\nidentification of the transition matrix. To address various scenarios, we have\nunified these observations within a new causal graph. In this graph, the input\ninstance is divided into a noise-resistant component and a noise-sensitive\ncomponent based on whether they are affected by the latent variable. These two\ncomponents contribute to identifying the ``causal transition matrix'', which\napproximates the true transition matrix with theoretical guarantee. In line\nwith this, we have designed a novel training framework that explicitly models\nthis causal relationship and, as a result, achieves a more accurate model for\ninferring the clean label.\n","authors":["Jiahui Li","Tai-Wei Chang","Kun Kuang","Ximing Li","Long Chen","Jun Zhou"],"pdf_url":"https://arxiv.org/pdf/2412.13516v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16496v4","updated":"2025-01-07T03:08:05Z","published":"2023-11-27T08:49:26Z","title":"Can Out-of-Domain data help to Learn Domain-Specific Prompts for\n Multimodal Misinformation Detection?","summary":" Spread of fake news using out-of-context images and captions has become\nwidespread in this era of information overload. Since fake news can belong to\ndifferent domains like politics, sports, etc. 
with their unique\ncharacteristics, inference on a test image-caption pair is contingent on how\nwell the model has been trained on similar data. Since training individual\nmodels for each domain is not practical, we propose a novel framework termed\nDPOD (Domain-specific Prompt tuning using Out-of-domain data), which can\nexploit out-of-domain data during training to improve fake news detection of\nall desired domains simultaneously. First, to compute generalizable features,\nwe modify the Vision-Language Model CLIP to extract features that help to\nalign the representations of the images and corresponding captions of both the\nin-domain and out-of-domain data in a label-aware manner. Further, we propose a\ndomain-specific prompt learning technique which leverages training samples of\nall the available domains based on the extent they can be useful to the desired\ndomain. Extensive experiments on the large-scale NewsCLIPpings and VERITE\nbenchmarks demonstrate that DPOD achieves state-of-the-art performance for this\nchallenging task. Code: https://github.com/scviab/DPOD.\n","authors":["Amartya Bhattacharya","Debarshi Brahma","Suraj Nagaje Mahadev","Anmol Asati","Vikas Verma","Soma Biswas"],"pdf_url":"https://arxiv.org/pdf/2311.16496v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03334v3","updated":"2025-01-07T03:01:49Z","published":"2024-10-23T19:56:57Z","title":"Neural Network Prediction of Strong Lensing Systems with Domain\n Adaptation and Uncertainty Quantification","summary":" Modeling strong gravitational lenses is computationally expensive for the\ncomplex data from modern and next-generation cosmic surveys. Deep learning has\nemerged as a promising approach for finding lenses and predicting lensing\nparameters, such as the Einstein radius. Mean-variance Estimators (MVEs) are a\ncommon approach for obtaining aleatoric (data) uncertainties from a neural\nnetwork prediction. 
However, neural networks have not been demonstrated to\nperform well on out-of-domain target data - e.g., when trained on\nsimulated data and applied to real, observational data. In this work, we\nperform the first study of the efficacy of MVEs in combination with\nunsupervised domain adaptation (UDA) on strong lensing data. The source domain\ndata is noiseless, and the target domain data has noise mimicking modern\ncosmology surveys. We find that adding UDA to MVE increases the accuracy on the\ntarget data by a factor of about two over an MVE model without UDA. Including\nUDA also permits much better-calibrated aleatoric uncertainty predictions.\nAdvancements in this approach may enable future applications of MVE models to\nreal observational data.\n","authors":["Shrihan Agarwal","Aleksandra Ćiprijanović","Brian D. Nord"],"pdf_url":"https://arxiv.org/pdf/2411.03334v3.pdf","comment":"Accepted to the Machine Learning for Physical Sciences workshop at\n NeurIPS 2024; 24 pages, 2 figures, 4 tables"},{"id":"http://arxiv.org/abs/2501.02411v2","updated":"2025-01-07T02:46:47Z","published":"2025-01-05T01:25:37Z","title":"Transfer learning via Regularized Linear Discriminant Analysis","summary":" Linear discriminant analysis is a widely used method for classification.\nHowever, the high dimensionality of predictors combined with small sample sizes\noften results in large classification errors. To address this challenge, it is\ncrucial to leverage data from related source models to enhance the\nclassification performance of a target model. We propose to address this\nproblem in the framework of transfer learning.\n In this paper, we present novel transfer learning methods via regularized\nrandom-effects linear discriminant analysis, where the discriminant direction\nis estimated as a weighted combination of ridge estimates obtained from both\nthe target and source models. 
Multiple strategies for determining these weights\nare introduced and evaluated, including one that minimizes the estimation risk\nof the discriminant vector and another that minimizes the classification error.\nUtilizing results from random matrix theory, we explicitly derive the\nasymptotic values of these weights and the associated classification error\nrates in the high-dimensional setting, where $p/n \\rightarrow \\gamma$, with $p$\nrepresenting the predictor dimension and $n$ the sample size. We also provide\ngeometric interpretations of various weights and a guidance on which weights to\nchoose. Extensive numerical studies, including simulations and analysis of\nproteomics-based 10-year cardiovascular disease risk classification,\ndemonstrate the effectiveness of the proposed approach.\n","authors":["Hongzhe Zhang","Arnab Auddy","Hongzhe Lee"],"pdf_url":"https://arxiv.org/pdf/2501.02411v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03477v1","updated":"2025-01-07T02:35:41Z","published":"2025-01-07T02:35:41Z","title":"A study on performance limitations in Federated Learning","summary":" Increasing privacy concerns and unrestricted access to data lead to the\ndevelopment of a novel machine learning paradigm called Federated Learning\n(FL). FL borrows many of the ideas from distributed machine learning, however,\nthe challenges associated with federated learning makes it an interesting\nengineering problem since the models are trained on edge devices. It was\nintroduced in 2016 by Google, and since then active research is being carried\nout in different areas within FL such as federated optimization algorithms,\nmodel and update compression, differential privacy, robustness, and attacks,\nfederated GANs and privacy preserved personalization. 
There are many open\nchallenges in the development of such federated machine learning systems and\nthis project will be focusing on the communication bottleneck and data Non\nIID-ness, and its effect on the performance of the models. These issues are\ncharacterized on a baseline model, model performance is evaluated, and\ndiscussions are made to overcome these issues.\n","authors":["Karthik Mohan"],"pdf_url":"https://arxiv.org/pdf/2501.03477v1.pdf","comment":"archive 2021 work"},{"id":"http://arxiv.org/abs/2501.03475v1","updated":"2025-01-07T02:33:25Z","published":"2025-01-07T02:33:25Z","title":"Reading with Intent -- Neutralizing Intent","summary":" Queries to large language models (LLMs) can be divided into two parts: the\ninstruction/question and the accompanying context. The context for\nretrieval-augmented generation (RAG) systems in most benchmarks comes from\nWikipedia or Wikipedia-like texts which are written in a neutral and factual\ntone. However, when RAG systems retrieve internet-based content, they encounter\ntext with diverse tones and linguistic styles, introducing challenges for\ndownstream tasks. The Reading with Intent task addresses this issue by\nevaluating how varying tones in context passages affect model performance.\nBuilding on prior work that focused on sarcasm, we extend this paradigm by\nconstructing a dataset where context passages are transformed to $11$ distinct\nemotions using a better synthetic data generation approach. Using this dataset,\nwe train an emotion translation model to systematically adapt passages to\nspecified emotional tones. The human evaluation shows that the LLM fine-tuned\nto become the emotion-translator benefited from the synthetically generated\ndata. Finally, the emotion-translator is used in the Reading with Intent task\nto transform the passages to a neutral tone. 
By neutralizing the passages, it\nmitigates the challenges posed by sarcastic passages and improves overall\nresults on this task by about $3\\%$.\n","authors":["Benjamin Reichman","Adar Avsian","Larry Heck"],"pdf_url":"https://arxiv.org/pdf/2501.03475v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03471v1","updated":"2025-01-07T02:15:58Z","published":"2025-01-07T02:15:58Z","title":"Hyperbolic Binary Neural Network","summary":" Binary Neural Network (BNN) converts full-precision weights and activations\ninto their extreme 1-bit counterparts, making it particularly suitable for\ndeployment on lightweight mobile devices. While binary neural networks are\ntypically formulated as a constrained optimization problem and optimized in the\nbinarized space, general neural networks are formulated as an unconstrained\noptimization problem and optimized in the continuous space. This paper\nintroduces the Hyperbolic Binary Neural Network (HBNN) by leveraging the\nframework of hyperbolic geometry to optimize the constrained problem.\nSpecifically, we transform the constrained problem in hyperbolic space into an\nunconstrained one in Euclidean space using the Riemannian exponential map. On\nthe other hand, we also propose the Exponential Parametrization Cluster (EPC)\nmethod, which, compared to the Riemannian exponential map, shrinks the segment\ndomain based on a diffeomorphism. This approach increases the probability of\nweight flips, thereby maximizing the information gain in BNNs. 
Experimental\nresults on CIFAR10, CIFAR100, and ImageNet classification datasets with\nVGGsmall, ResNet18, and ResNet34 models illustrate the superior performance of\nour HBNN over state-of-the-art methods.\n","authors":["Jun Chen","Jingyang Xiang","Tianxin Huang","Xiangrui Zhao","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2501.03471v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07708v2","updated":"2025-01-07T02:15:42Z","published":"2024-09-12T02:25:04Z","title":"Dataset-Free Weight-Initialization on Restricted Boltzmann Machine","summary":" In feed-forward neural networks, dataset-free weight-initialization methods\nsuch as LeCun, Xavier (or Glorot), and He initializations have been developed.\nThese methods randomly determine the initial values of weight parameters based\non specific distributions (e.g., Gaussian or uniform distributions) without\nusing training datasets. To the best of the authors' knowledge, such a\ndataset-free weight-initialization method is yet to be developed for restricted\nBoltzmann machines (RBMs), which are probabilistic neural networks consisting\nof two layers. In this study, we derive a dataset-free weight-initialization\nmethod for Bernoulli--Bernoulli RBMs based on statistical mechanical analysis.\nIn the proposed weight-initialization method, the weight parameters are drawn\nfrom a Gaussian distribution with zero mean. The standard deviation of the\nGaussian distribution is optimized based on our hypothesis that a standard\ndeviation providing a larger layer correlation (LC) between the two layers\nimproves the learning efficiency. The expression of the LC is derived based on\na statistical mechanical analysis. The optimal value of the standard deviation\ncorresponds to the maximum point of the LC. 
The proposed weight-initialization\nmethod is identical to Xavier initialization in a specific case (i.e., when the\nsizes of the two layers are the same, the random variables of the layers are\n$\\{-1,1\\}$-binary, and all bias parameters are zero). The validity of the\nproposed weight-initialization method is demonstrated in numerical experiments\nusing a toy and real-world datasets.\n","authors":["Muneki Yasuda","Ryosuke Maeno","Chako Takahashi"],"pdf_url":"https://arxiv.org/pdf/2409.07708v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07108v3","updated":"2025-01-07T02:14:56Z","published":"2024-02-11T05:35:50Z","title":"Decoupling Learning and Decision-Making: Breaking the\n $\\mathcal{O}(\\sqrt{T})$ Barrier in Online Resource Allocation with\n First-Order Methods","summary":" Online linear programming plays an important role in both revenue management\nand resource allocation, and recent research has focused on developing\nefficient first-order online learning algorithms. Despite the empirical success\nof first-order methods, they typically achieve a regret no better than\n$\\mathcal{O}(\\sqrt{T})$, which is suboptimal compared to the $\\mathcal{O}(\\log\nT)$ bound guaranteed by the state-of-the-art linear programming (LP)-based\nonline algorithms. This paper establishes several important facts about online\nlinear programming, which unveils the challenge for first-order-method-based\nonline algorithms to achieve beyond $\\mathcal{O}(\\sqrt{T})$ regret. To address\nthe challenge, we introduce a new algorithmic framework that decouples learning\nfrom decision-making. 
For the first time, we show that first-order methods can\nattain regret $\mathcal{O}(T^{1/3})$ with this new framework.\n","authors":["Wenzhi Gao","Chunlin Sun","Chenyu Xue","Dongdong Ge","Yinyu Ye"],"pdf_url":"https://arxiv.org/pdf/2402.07108v3.pdf","comment":"Merged into arXiv:2501.02761"},{"id":"http://arxiv.org/abs/2412.19139v2","updated":"2025-01-07T01:50:11Z","published":"2024-12-26T09:51:05Z","title":"PlanLLM: Video Procedure Planning with Refinable Large Language Models","summary":" Video procedure planning, i.e., planning a sequence of action steps given the\nvideo frames of start and goal states, is an essential ability for embodied AI.\nRecent works utilize Large Language Models (LLMs) to generate enriched action\nstep description texts to guide action step decoding. Although LLMs are\nintroduced, these methods decode the action steps into a closed-set of one-hot\nvectors, limiting the model's capability of generalizing to new steps or tasks.\nAdditionally, fixed action step descriptions based on world-level commonsense\nmay contain noise in specific instances of visual states. In this paper, we\npropose PlanLLM, a cross-modal joint learning framework with LLMs for video\nprocedure planning. We propose an LLM-Enhanced Planning module which fully uses\nthe generalization ability of LLMs to produce free-form planning output and to\nenhance action step decoding. We also propose a Mutual Information Maximization\nmodule to connect world-level commonsense of step descriptions and\nsample-specific information of visual states, enabling LLMs to employ the\nreasoning ability to generate step sequences. With the assistance of LLMs, our\nmethod can handle both closed-set and open-vocabulary procedure planning tasks. 
Our\nPlanLLM achieves superior performance on three benchmarks, demonstrating the\neffectiveness of our designs.\n","authors":["Dejie Yang","Zijing Zhao","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2412.19139v2.pdf","comment":"accepted to AAAI2025"},{"id":"http://arxiv.org/abs/2410.22376v2","updated":"2025-01-07T01:41:13Z","published":"2024-10-29T07:43:39Z","title":"Rare-to-Frequent: Unlocking Compositional Generation Power of Diffusion\n Models on Rare Concepts with LLM Guidance","summary":" State-of-the-art text-to-image (T2I) diffusion models often struggle to\ngenerate rare compositions of concepts, e.g., objects with unusual attributes.\nIn this paper, we show that the compositional generation power of diffusion\nmodels on such rare concepts can be significantly enhanced by Large\nLanguage Model (LLM) guidance. We start with empirical and theoretical\nanalysis, demonstrating that exposing frequent concepts relevant to the target\nrare concepts during the diffusion sampling process yields more accurate\nconcept composition. Based on this, we propose a training-free approach, R2F,\nthat plans and executes the overall rare-to-frequent concept guidance\nthroughout the diffusion inference by leveraging the abundant semantic\nknowledge in LLMs. Our framework is flexible across any pre-trained diffusion\nmodels and LLMs, and can be seamlessly integrated with region-guided\ndiffusion approaches. In extensive experiments on three datasets, including our\nnewly proposed benchmark RareBench, which contains various prompts with rare\ncompositions of concepts, R2F significantly surpasses existing models including\nSD3.0 and FLUX by up to 28.1%p in T2I alignment. 
Code is available at\nhttps://github.com/krafton-ai/Rare-to-Frequent.\n","authors":["Dongmin Park","Sebin Kim","Taehong Moon","Minkyu Kim","Kangwook Lee","Jaewoong Cho"],"pdf_url":"https://arxiv.org/pdf/2410.22376v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03461v1","updated":"2025-01-07T01:35:56Z","published":"2025-01-07T01:35:56Z","title":"Radar Signal Recognition through Self-Supervised Learning and Domain\n Adaptation","summary":" Automatic radar signal recognition (RSR) plays a pivotal role in electronic\nwarfare (EW), as accurately classifying radar signals is critical for informing\ndecision-making processes. Recent advances in deep learning have shown\nsignificant potential in improving RSR performance in domains with ample\nannotated data. However, these methods fall short in EW scenarios where\nannotated RF data are scarce or impractical to obtain. To address these\nchallenges, we introduce a self-supervised learning (SSL) method which utilises\nmasked signal modelling and RF domain adaptation to enhance RSR performance in\nenvironments with limited RF samples and labels. Specifically, we investigate\npre-training masked autoencoders (MAE) on baseband in-phase and quadrature\n(I/Q) signals from various RF domains and subsequently transfer the learned\nrepresentation to the radar domain, where annotated data are limited. Empirical\nresults show that our lightweight self-supervised ResNet model with domain\nadaptation achieves up to a 17.5\\% improvement in 1-shot classification\naccuracy when pre-trained on in-domain signals (i.e., radar signals) and up to\na 16.31\\% improvement when pre-trained on out-of-domain signals (i.e.,\ncommunication signals), compared to its baseline without SSL. 
We also provide reference\nresults for several MAE designs and pre-training strategies, establishing a new\nbenchmark for few-shot radar signal classification.\n","authors":["Zi Huang","Akila Pemasiri","Simon Denman","Clinton Fookes","Terrence Martin"],"pdf_url":"https://arxiv.org/pdf/2501.03461v1.pdf","comment":"5 pages, 9 figures"},{"id":"http://arxiv.org/abs/2501.03451v1","updated":"2025-01-07T00:43:18Z","published":"2025-01-07T00:43:18Z","title":"Structure-Preference Enabled Graph Embedding Generation under\n Differential Privacy","summary":" Graph embedding generation techniques aim to learn low-dimensional vectors\nfor each node in a graph and have recently gained increasing research\nattention. Publishing low-dimensional node vectors enables various graph\nanalysis tasks, such as structural equivalence and link prediction. Yet,\nimproper publication opens a backdoor to malicious attackers, who can infer\nsensitive information of individuals from the low-dimensional node vectors.\nExisting methods tackle this issue by developing deep graph learning models\nwith differential privacy (DP). However, they often suffer from large noise\ninjections and cannot provide structural preferences consistent with mining\nobjectives. Recently, skip-gram based graph embedding generation techniques\nhave been widely used due to their ability to extract customizable structures.\nBased on skip-gram, we present SE-PrivGEmb, a structure-preference enabled\ngraph embedding generation method under DP. For arbitrary structure\npreferences, we design a unified noise tolerance mechanism via perturbing\nnon-zero vectors. This mechanism mitigates utility degradation caused by high\nsensitivity. By carefully designing negative sampling probabilities in\nskip-gram, we theoretically demonstrate that skip-gram can preserve arbitrary\nproximities, which quantify structural features in graphs. 
Extensive experiments show that\nour method outperforms existing state-of-the-art methods under structural\nequivalence and link prediction tasks.\n","authors":["Sen Zhang","Qingqing Ye","Haibo Hu"],"pdf_url":"https://arxiv.org/pdf/2501.03451v1.pdf","comment":"Accepted by ICDE 25"},{"id":"http://arxiv.org/abs/2501.03448v1","updated":"2025-01-07T00:30:31Z","published":"2025-01-07T00:30:31Z","title":"Optimizing Value of Learning in Task-Oriented Federated Meta-Learning\n Systems","summary":" Federated Learning (FL) has gained significant attention in recent years due\nto its distributed nature and privacy preserving benefits. However, a key\nlimitation of conventional FL is that it learns and distributes a common global\nmodel to all participants, which fails to provide customized solutions for\ndiverse task requirements. Federated meta-learning (FML) offers a promising\nsolution to this issue by enabling devices to finetune local models after\nreceiving a shared meta-model from the server. In this paper, we propose a\ntask-oriented FML framework over non-orthogonal multiple access (NOMA)\nnetworks. A novel metric, termed value of learning (VoL), is introduced to\nassess the individual training needs across devices. Moreover, a task-level\nweight (TLW) metric is defined based on task requirements and fairness\nconsiderations, guiding the prioritization of edge devices during FML training.\nThe formulated problem, to maximize the sum of TLW-based VoL across devices,\nforms a non-convex mixed-integer non-linear programming (MINLP) challenge,\naddressed here using a parameterized deep Q-network (PDQN) algorithm to handle\nboth discrete and continuous variables. 
Simulation results demonstrate that our\napproach significantly outperforms baseline schemes, underscoring the\nadvantages of the proposed framework.\n","authors":["Bibo Wu","Fang Fang","Xianbin Wang"],"pdf_url":"https://arxiv.org/pdf/2501.03448v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11200v2","updated":"2025-01-07T00:23:43Z","published":"2024-11-17T23:30:01Z","title":"Countering Backdoor Attacks in Image Recognition: A Survey and\n Evaluation of Mitigation Strategies","summary":" The widespread adoption of deep learning across various industries has\nintroduced substantial challenges, particularly in terms of model\nexplainability and security. The inherent complexity of deep learning models,\nwhile contributing to their effectiveness, also renders them susceptible to\nadversarial attacks. Among these, backdoor attacks are especially concerning,\nas they involve surreptitiously embedding specific triggers within training\ndata, causing the model to exhibit aberrant behavior when presented with input\ncontaining the triggers. Such attacks often exploit vulnerabilities in\noutsourced processes, compromising model integrity without affecting\nperformance on clean (trigger-free) input data. In this paper, we present a\ncomprehensive review of existing mitigation strategies designed to counter\nbackdoor attacks in image recognition. We provide an in-depth analysis of the\ntheoretical foundations, practical efficacy, and limitations of these\napproaches. In addition, we conduct an extensive benchmarking of sixteen\nstate-of-the-art approaches against eight distinct backdoor attacks, utilizing\nthree datasets, four model architectures, and three poisoning ratios. Our\nresults, derived from 122,236 individual experiments, indicate that while many\napproaches provide some level of protection, their performance can vary\nconsiderably. 
Furthermore, when compared to two seminal approaches, most newer\napproaches do not demonstrate substantial improvements in overall performance\nor consistency across diverse settings. Drawing from these findings, we propose\npotential directions for developing more effective and generalizable defensive\nmechanisms in the future.\n","authors":["Kealan Dunnett","Reza Arablouei","Dimity Miller","Volkan Dedeoglu","Raja Jurdak"],"pdf_url":"https://arxiv.org/pdf/2411.11200v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03445v1","updated":"2025-01-07T00:15:04Z","published":"2025-01-07T00:15:04Z","title":"Physics-Constrained Generative Artificial Intelligence for Rapid Takeoff\n Trajectory Design","summary":" To aid urban air mobility (UAM), electric vertical takeoff and landing\n(eVTOL) aircraft are being developed. Conventional multidisciplinary analysis\nand optimization (MDAO) can be expensive, while surrogate-based optimization\ncan struggle with challenging physical constraints. This work proposes\nphysics-constrained generative adversarial networks (physicsGAN) to\nintelligently parameterize the takeoff control profiles of an eVTOL aircraft\nand to transform the original design space to a feasible space. Specifically,\nthe transformed feasible space refers to a space where all designs directly\nsatisfy all design constraints. The physicsGAN-enabled surrogate-based takeoff\ntrajectory design framework was demonstrated on the Airbus A3 Vahana. The\nphysicsGAN generated only feasible control profiles of power and wing angle in\nthe feasible space, with around 98.9% of designs satisfying all constraints. The\nproposed design framework obtained 99.6% accuracy compared with\nsimulation-based optimal design and took only 2.2 seconds, reducing the\ncomputational time by around 200 times. 
Meanwhile, data-driven GAN-enabled\nsurrogate-based optimization took 21.9 seconds using a derivative-free\noptimizer, which was around an order of magnitude slower than the proposed\nframework. Moreover, data-driven GAN-based optimization using\ngradient-based optimizers could not consistently find the optimal design during\nrandom trials and got stuck in an infeasible region, which is problematic in\npractice. Therefore, the proposed physicsGAN-based design framework\noutperformed data-driven GAN-based design in terms of efficiency (2.2\nseconds), optimality (99.6% accuracy), and feasibility (100% feasible).\nAccording to the literature review, this is the first physics-constrained\ngenerative artificial intelligence enabled by surrogate models.\n","authors":["Samuel Sisk","Xiaosong Du"],"pdf_url":"https://arxiv.org/pdf/2501.03445v1.pdf","comment":"Conference version with 10 pages and 7 figures"}],"Multimedia":[{"id":"http://arxiv.org/abs/2501.03939v1","updated":"2025-01-07T17:00:35Z","published":"2025-01-07T17:00:35Z","title":"Visual question answering: from early developments to recent advances --\n a survey","summary":" Visual Question Answering (VQA) is an evolving research field aimed at\nenabling machines to answer questions about visual content by integrating image\nand language processing techniques such as feature extraction, object\ndetection, text embedding, natural language understanding, and language\ngeneration. With the growth of multimodal data research, VQA has gained\nsignificant attention due to its broad applications, including interactive\neducational tools, medical image diagnosis, customer service, entertainment,\nand social media captioning. 
Additionally, VQA plays a vital role in assisting\nvisually impaired individuals by generating descriptive content from images.\nThis survey introduces a taxonomy of VQA architectures, categorizing them based\non design choices and key components to facilitate comparative analysis and\nevaluation. We review major VQA approaches, focusing on deep learning-based\nmethods, and explore the emerging field of Large Visual Language Models (LVLMs)\nthat have demonstrated success in multimodal tasks like VQA. The paper further\nexamines available datasets and evaluation metrics essential for measuring VQA\nsystem performance, followed by an exploration of real-world VQA applications.\nFinally, we highlight ongoing challenges and future directions in VQA research,\npresenting open questions and potential areas for further development. This\nsurvey serves as a comprehensive resource for researchers and practitioners\ninterested in the latest advancements and future directions of VQA.\n","authors":["Ngoc Dung Huynh","Mohamed Reda Bouadjenek","Sunil Aryal","Imran Razzak","Hakim Hacid"],"pdf_url":"https://arxiv.org/pdf/2501.03939v1.pdf","comment":"20"},{"id":"http://arxiv.org/abs/2501.03605v1","updated":"2025-01-07T08:06:35Z","published":"2025-01-07T08:06:35Z","title":"ConcealGS: Concealing Invisible Copyright Information in 3D Gaussian\n Splatting","summary":" With the rapid development of 3D reconstruction technology, the widespread\ndistribution of 3D data has become a future trend. While traditional visual\ndata (such as images and videos) and NeRF-based formats already have mature\ntechniques for copyright protection, steganographic techniques for the emerging\n3D Gaussian Splatting (3D-GS) format have yet to be fully explored. To address\nthis, we propose ConcealGS, an innovative method for embedding implicit\ninformation into 3D-GS. 
By introducing the knowledge distillation and gradient\noptimization strategy based on 3D-GS, ConcealGS overcomes the limitations of\nNeRF-based models and enhances the robustness of implicit information and the\nquality of 3D reconstruction. We evaluate ConcealGS in various potential\napplication scenarios, and experimental results have demonstrated that\nConcealGS not only successfully recovers implicit information but also has\nalmost no impact on rendering quality, providing a new approach for embedding\ninvisible and recoverable information into 3D models in the future.\n","authors":["Yifeng Yang","Hengyu Liu","Chenxin Li","Yining Sun","Wuyang Li","Yifan Liu","Yiyang Lin","Yixuan Yuan","Nanyang Ye"],"pdf_url":"https://arxiv.org/pdf/2501.03605v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.19139v2","updated":"2025-01-07T01:50:11Z","published":"2024-12-26T09:51:05Z","title":"PlanLLM: Video Procedure Planning with Refinable Large Language Models","summary":" Video procedure planning, i.e., planning a sequence of action steps given the\nvideo frames of start and goal states, is an essential ability for embodied AI.\nRecent works utilize Large Language Models (LLMs) to generate enriched action\nstep description texts to guide action step decoding. Although LLMs are\nintroduced, these methods decode the action steps into a closed-set of one-hot\nvectors, limiting the model's capability of generalizing to new steps or tasks.\nAdditionally, fixed action step descriptions based on world-level commonsense\nmay contain noise in specific instances of visual states. In this paper, we\npropose PlanLLM, a cross-modal joint learning framework with LLMs for video\nprocedure planning. We propose an LLM-Enhanced Planning module which fully uses\nthe generalization ability of LLMs to produce free-form planning output and to\nenhance action step decoding. 
We also propose a Mutual Information Maximization\nmodule to connect world-level commonsense of step descriptions and\nsample-specific information of visual states, enabling LLMs to employ their\nreasoning ability to generate step sequences. With the assistance of LLMs, our\nmethod can perform both closed-set and open-vocabulary procedure planning tasks. Our\nPlanLLM achieves superior performance on three benchmarks, demonstrating the\neffectiveness of our designs.\n","authors":["Dejie Yang","Zijing Zhao","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2412.19139v2.pdf","comment":"accepted to AAAI2025"}],"Artificial Intelligence":[{"id":"http://arxiv.org/abs/2412.05313v3","updated":"2025-01-07T18:57:23Z","published":"2024-11-28T19:31:50Z","title":"λ: A Benchmark for Data-Efficiency in Long-Horizon Indoor Mobile\n Manipulation Robotics","summary":" Efficiently learning and executing long-horizon mobile manipulation (MoMa)\ntasks is crucial for advancing robotics in household and workplace settings.\nHowever, current MoMa models are data-inefficient, underscoring the need for\nimproved models, yet the realistic-sized benchmarks required to evaluate their\nefficiency do not exist. To address this, we introduce the LAMBDA\n({\\lambda}) benchmark (Long-horizon Actions for Mobile-manipulation\nBenchmarking of Directed Activities), which evaluates the data efficiency of\nmodels on language-conditioned, long-horizon, multi-room, multi-floor,\npick-and-place tasks using a dataset of manageable size that is more feasible\nto collect. The benchmark includes 571 human-collected demonstrations that\nprovide realism and diversity in simulated and real-world settings. Unlike\nplanner-generated data, these trajectories offer natural variability and\nreplay-verifiability, ensuring robust learning and evaluation. 
We benchmark\nseveral models, including learning-based models and a neuro-symbolic modular\napproach combining foundation models with task and motion planning.\nLearning-based models show suboptimal success rates, even when leveraging\npretrained weights, underscoring significant data inefficiencies. However, the\nneuro-symbolic approach performs significantly better while being more\ndata-efficient. Findings highlight the need for more data-efficient learning-based\nMoMa approaches. {\\lambda} addresses this gap by serving as a key benchmark for\nevaluating the data efficiency of those future models in handling household\nrobotics tasks.\n","authors":["Ahmed Jaafar","Shreyas Sundara Raman","Yichen Wei","Sudarshan Harithas","Sofia Juliani","Anneke Wernerfelt","Benedict Quartey","Ifrah Idrees","Jason Xinyu Liu","Stefanie Tellex"],"pdf_url":"https://arxiv.org/pdf/2412.05313v3.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2412.20429v3","updated":"2025-01-07T18:24:45Z","published":"2024-12-29T10:46:08Z","title":"Multi-Scenario Reasoning: Unlocking Cognitive Autonomy in Humanoid\n Robots for Multimodal Understanding","summary":" To improve the cognitive autonomy of humanoid robots, this research proposes\na multi-scenario reasoning architecture to solve the technical shortcomings of\nmulti-modal understanding in this field. It draws on a simulation-based\nexperimental design that adopts multi-modal synthesis (visual, auditory,\ntactile) and builds a simulator \"Maha\" to perform the experiment. The findings\ndemonstrate the feasibility of this architecture on multimodal data. It\nprovides reference experience for the exploration of cross-modal interaction\nstrategies for humanoid robots in dynamic environments. In addition,\nmulti-scenario reasoning brings the high-level reasoning mechanism of the\nhuman brain to humanoid robots at the cognitive level. This new concept\npromotes cross-scenario practical task transfer and semantic-driven action\nplanning. 
It heralds the future development of self-learning and autonomous\nbehavior of humanoid robots in changing scenarios.\n","authors":["Libo Wang"],"pdf_url":"https://arxiv.org/pdf/2412.20429v3.pdf","comment":"The main text is 5 pages, 2 figures, and 3 tables"},{"id":"http://arxiv.org/abs/2501.03968v1","updated":"2025-01-07T18:06:27Z","published":"2025-01-07T18:06:27Z","title":"VLM-driven Behavior Tree for Context-aware Task Planning","summary":" The use of Large Language Models (LLMs) for generating Behavior Trees (BTs)\nhas recently gained attention in the robotics community, yet remains in its\nearly stages of development. In this paper, we propose a novel framework that\nleverages Vision-Language Models (VLMs) to interactively generate and edit BTs\nthat address visual conditions, enabling context-aware robot operations in\nvisually complex environments. A key feature of our approach lies in the\nconditional control through self-prompted visual conditions. Specifically, the\nVLM generates BTs with visual condition nodes, where conditions are expressed\nas free-form text. Another VLM process integrates the text into its prompt and\nevaluates the conditions against real-world images during robot execution. We\nvalidated our framework in a real-world cafe scenario, demonstrating both its\nfeasibility and limitations.\n","authors":["Naoki Wake","Atsushi Kanehira","Jun Takamatsu","Kazuhiro Sasabuchi","Katsushi Ikeuchi"],"pdf_url":"https://arxiv.org/pdf/2501.03968v1.pdf","comment":"10 pages, 11 figures, 5 tables. Last updated on January 7th, 2024"},{"id":"http://arxiv.org/abs/2403.05300v5","updated":"2025-01-07T17:42:16Z","published":"2024-03-08T13:29:46Z","title":"Unity by Diversity: Improved Representation Learning in Multimodal VAEs","summary":" Variational Autoencoders for multimodal data hold promise for many tasks in\ndata analysis, such as representation learning, conditional generation, and\nimputation. 
Current architectures either share the encoder output, decoder\ninput, or both across modalities to learn a shared representation. Such\narchitectures impose hard constraints on the model. In this work, we show that\na better latent representation can be obtained by replacing these hard\nconstraints with a soft constraint. We propose a new mixture-of-experts prior,\nsoftly guiding each modality's latent representation towards a shared aggregate\nposterior. This approach results in a superior latent representation and allows\neach encoding to preserve information better from its uncompressed original\nfeatures. In extensive experiments on multiple benchmark datasets and two\nchallenging real-world datasets, we show improved learned latent\nrepresentations and imputation of missing data modalities compared to existing\nmethods.\n","authors":["Thomas M. Sutter","Yang Meng","Andrea Agostini","Daphné Chopard","Norbert Fortin","Julia E. Vogt","Babak Shahbaba","Stephan Mandt"],"pdf_url":"https://arxiv.org/pdf/2403.05300v5.pdf","comment":"Accepted at Neurips 2024"},{"id":"http://arxiv.org/abs/2408.11735v3","updated":"2025-01-07T17:34:04Z","published":"2024-08-21T15:59:33Z","title":"Clinical Insights: A Comprehensive Review of Language Models in Medicine","summary":" This paper explores the advancements and applications of language models in\nhealthcare, focusing on their clinical use cases. It examines the evolution\nfrom early encoder-based systems requiring extensive fine-tuning to\nstate-of-the-art large language and multimodal models capable of integrating\ntext and visual data through in-context learning. The analysis emphasizes\nlocally deployable models, which enhance data privacy and operational autonomy,\nand their applications in tasks such as text generation, classification,\ninformation extraction, and conversational systems. 
The paper also highlights a\nstructured organization of tasks and a tiered ethical approach, providing a\nvaluable resource for researchers and practitioners, while discussing key\nchallenges related to ethics, evaluation, and implementation.\n","authors":["Nikita Neveditsin","Pawan Lingras","Vijay Mago"],"pdf_url":"https://arxiv.org/pdf/2408.11735v3.pdf","comment":"Submitted to PLOS Digital Health, Revision 1"},{"id":"http://arxiv.org/abs/2301.08110v6","updated":"2025-01-07T17:26:26Z","published":"2023-01-19T15:01:00Z","title":"AtMan: Understanding Transformer Predictions Through Memory Efficient\n Attention Manipulation","summary":" Generative transformer models have become increasingly complex, with large\nnumbers of parameters and the ability to process multiple input modalities.\nCurrent methods for explaining their predictions are resource-intensive. Most\ncrucially, they require prohibitively large amounts of extra memory, since they\nrely on backpropagation which allocates almost twice as much GPU memory as the\nforward pass. This makes it difficult, if not impossible, to use them in\nproduction. We present AtMan that provides explanations of generative\ntransformer models at almost no extra cost. Specifically, AtMan is a\nmodality-agnostic perturbation method that manipulates the attention mechanisms\nof transformers to produce relevance maps for the input with respect to the\noutput prediction. Instead of using backpropagation, AtMan applies a\nparallelizable token-based search method based on cosine similarity\nneighborhood in the embedding space. Our exhaustive experiments on text and\nimage-text benchmarks demonstrate that AtMan outperforms current\nstate-of-the-art gradient-based methods on several metrics while being\ncomputationally efficient. 
As such, AtMan is suitable for use in large model\ninference deployments.\n","authors":["Björn Deiseroth","Mayukh Deb","Samuel Weinbach","Manuel Brack","Patrick Schramowski","Kristian Kersting"],"pdf_url":"https://arxiv.org/pdf/2301.08110v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03952v1","updated":"2025-01-07T17:24:17Z","published":"2025-01-07T17:24:17Z","title":"Localizing AI: Evaluating Open-Weight Language Models for Languages of\n Baltic States","summary":" Although large language models (LLMs) have transformed our expectations of\nmodern language technologies, concerns over data privacy often restrict the use\nof commercially available LLMs hosted outside of EU jurisdictions. This limits\ntheir application in governmental, defence, and other data-sensitive sectors.\nIn this work, we evaluate the extent to which locally deployable open-weight\nLLMs support lesser-spoken languages such as Lithuanian, Latvian, and Estonian.\nWe examine various size and precision variants of the top-performing\nmultilingual open-weight models, Llama~3, Gemma~2, Phi, and NeMo, on machine\ntranslation, multiple-choice question answering, and free-form text generation.\nThe results indicate that while certain models like Gemma~2 perform close to\nthe top commercially available models, many LLMs struggle with these languages.\nMost surprisingly, however, we find that these models, while showing close to\nstate-of-the-art translation performance, are still prone to lexical\nhallucinations with errors in at least 1 in 20 words for all open-weight\nmultilingual LLMs.\n","authors":["Jurgita Kapočiūtė-Dzikienė","Toms Bergmanis","Mārcis Pinnis"],"pdf_url":"https://arxiv.org/pdf/2501.03952v1.pdf","comment":"This paper is accepted to NoDaLiDa/Baltic-HLT 2025"},{"id":"http://arxiv.org/abs/2501.03941v1","updated":"2025-01-07T17:02:33Z","published":"2025-01-07T17:02:33Z","title":"Synthetic Data Privacy Metrics","summary":" Recent advancements in generative AI have made it possible 
to create\nsynthetic datasets that can be as accurate as real-world data for training AI\nmodels, powering statistical insights, and fostering collaboration with\nsensitive datasets while offering strong privacy guarantees. Effectively\nmeasuring the empirical privacy of synthetic data is an important step in the\nprocess. However, while there is a multitude of new privacy metrics being\npublished every day, there currently is no standardization. In this paper, we\nreview the pros and cons of popular metrics that include simulations of\nadversarial attacks. We also review current best practices for amending\ngenerative models to enhance the privacy of the data they create (e.g.\ndifferential privacy).\n","authors":["Amy Steier","Lipika Ramaswamy","Andre Manoel","Alexa Haushalter"],"pdf_url":"https://arxiv.org/pdf/2501.03941v1.pdf","comment":"14 pages, 2 figures"},{"id":"http://arxiv.org/abs/2501.03940v1","updated":"2025-01-07T17:00:49Z","published":"2025-01-07T17:00:49Z","title":"Not all tokens are created equal: Perplexity Attention Weighted Networks\n for AI generated text detection","summary":" The rapid advancement in large language models (LLMs) has significantly\nenhanced their ability to generate coherent and contextually relevant text,\nraising concerns about the misuse of AI-generated content and making it\ncritical to detect it. However, the task remains challenging, particularly in\nunseen domains or with unfamiliar LLMs. Leveraging LLM next-token distribution\noutputs offers a theoretically appealing approach for detection, as they\nencapsulate insights from the models' extensive pre-training on diverse\ncorpora. Despite its promise, zero-shot methods that attempt to operationalize\nthese outputs have met with limited success. We hypothesize that one of the\nproblems is that they use the mean to aggregate next-token distribution metrics\nacross tokens, when some tokens are naturally easier or harder to predict and\nshould be weighted differently. 
Based on this idea, we propose the Perplexity\nAttention Weighted Network (PAWN), which uses the last hidden states of the LLM\nand positions to weight the sum of a series of features based on metrics from\nthe next-token distribution across the sequence length. Although not zero-shot,\nour method allows us to cache the last hidden states and next-token\ndistribution metrics on disk, greatly reducing the training resource\nrequirements. PAWN shows competitive and even better performance\nin-distribution than the strongest baselines (fine-tuned LMs) with a fraction\nof their trainable parameters. Our model also generalizes better to unseen\ndomains and source models, with smaller variability in the decision boundary\nacross distribution shifts. It is also more robust to adversarial attacks, and\nif the backbone has multilingual capabilities, it presents decent\ngeneralization to languages not seen during supervised training, with LLaMA3-1B\nreaching a mean macro-averaged F1 score of 81.46% in cross-validation with nine\nlanguages.\n","authors":["Pablo Miralles-González","Javier Huertas-Tato","Alejandro Martín","David Camacho"],"pdf_url":"https://arxiv.org/pdf/2501.03940v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03936v1","updated":"2025-01-07T16:53:01Z","published":"2025-01-07T16:53:01Z","title":"PPTAgent: Generating and Evaluating Presentations Beyond Text-to-Slides","summary":" Automatically generating presentations from documents is a challenging task\nthat requires balancing content quality, visual design, and structural\ncoherence. Existing methods primarily focus on improving and evaluating the\ncontent quality in isolation, often overlooking visual design and structural\ncoherence, which limits their practical applicability. To address these\nlimitations, we propose PPTAgent, which comprehensively improves presentation\ngeneration through a two-stage, edit-based approach inspired by human\nworkflows. 
PPTAgent first analyzes reference presentations to understand their\nstructural patterns and content schemas, then drafts outlines and generates\nslides through code actions to ensure consistency and alignment. To\ncomprehensively evaluate the quality of generated presentations, we further\nintroduce PPTEval, an evaluation framework that assesses presentations across\nthree dimensions: Content, Design, and Coherence. Experiments show that\nPPTAgent significantly outperforms traditional automatic presentation\ngeneration methods across all three dimensions. The code and data are available\nat https://github.com/icip-cas/PPTAgent.\n","authors":["Hao Zheng","Xinyan Guan","Hao Kong","Jia Zheng","Hongyu Lin","Yaojie Lu","Ben He","Xianpei Han","Le Sun"],"pdf_url":"https://arxiv.org/pdf/2501.03936v1.pdf","comment":"8 pages, 20 figures"},{"id":"http://arxiv.org/abs/2501.03916v1","updated":"2025-01-07T16:31:10Z","published":"2025-01-07T16:31:10Z","title":"Dolphin: Closed-loop Open-ended Auto-research through Thinking,\n Practice, and Feedback","summary":" The scientific research paradigm is undergoing a profound transformation\nowing to the development of Artificial Intelligence (AI). Recent works\ndemonstrate that various AI-assisted research methods can largely improve\nresearch efficiency by improving data analysis, accelerating computation, and\nfostering novel idea generation. To further move towards the ultimate goal\n(i.e., automatic scientific research), in this paper, we propose Dolphin, the\nfirst closed-loop open-ended auto-research framework to further build the\nentire process of human scientific research. Dolphin can generate research\nideas, perform experiments, and get feedback from experimental results to\ngenerate higher-quality ideas. More specifically, Dolphin first generates novel\nideas based on relevant papers which are ranked by the topic and task\nattributes. 
Then, code is automatically generated and debugged using the\nexception-traceback-guided local code structure. Finally, Dolphin automatically\nanalyzes the results of each idea and feeds the results back to the next round\nof idea generation. Experiments are conducted on benchmark datasets across\ndifferent topics, and results show that Dolphin can generate novel ideas\ncontinuously and complete experiments in a loop. We highlight that Dolphin\ncan automatically propose methods that are comparable to the state-of-the-art\nin some tasks such as 2D image classification and 3D point classification.\n","authors":["Jiakang Yuan","Xiangchao Yan","Botian Shi","Tao Chen","Wanli Ouyang","Bo Zhang","Lei Bai","Yu Qiao","Bowen Zhou"],"pdf_url":"https://arxiv.org/pdf/2501.03916v1.pdf","comment":"19 pages, 11 figures, and our homepage:\n https://unimodal4reasoning.github.io/Dolphin-project-page/"},{"id":"http://arxiv.org/abs/2406.19223v2","updated":"2025-01-07T16:20:17Z","published":"2024-06-27T14:49:08Z","title":"T-FREE: Subword Tokenizer-Free Generative LLMs via Sparse\n Representations for Memory-Efficient Embeddings","summary":" Tokenizers are crucial for encoding information in Large Language Models, but\ntheir development has recently stagnated, and they contain inherent weaknesses.\nMajor limitations include computational overhead, ineffective vocabulary use,\nand unnecessarily large embedding and head layers. Additionally, their\nperformance is biased towards a reference corpus, leading to reduced\neffectiveness for underrepresented languages.\n To remedy these issues, we propose T-FREE, which directly embeds words\nthrough sparse activation patterns over character triplets, and does not\nrequire a reference corpus. T-FREE inherently exploits morphological\nsimilarities and allows for strong compression of embedding layers. 
In our\nexhaustive experimental evaluation, we achieve competitive downstream\nperformance with a parameter reduction of more than 85% on these layers.\nFurther, T-FREE shows significant improvements in cross-lingual transfer\nlearning.\n","authors":["Björn Deiseroth","Manuel Brack","Patrick Schramowski","Kristian Kersting","Samuel Weinbach"],"pdf_url":"https://arxiv.org/pdf/2406.19223v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03904v1","updated":"2025-01-07T16:18:55Z","published":"2025-01-07T16:18:55Z","title":"Exploring the Potential of Large Language Models in Public\n Transportation: San Antonio Case Study","summary":" The integration of large language models (LLMs) into public transit systems\npresents a transformative opportunity to enhance urban mobility. This study\nexplores the potential of LLMs to revolutionize public transportation\nmanagement within the context of San Antonio's transit system. Leveraging the\ncapabilities of LLMs in natural language processing and data analysis, we\ninvestigate their capabilities to optimize route planning, reduce wait times,\nand provide personalized travel assistance. By utilizing the General Transit\nFeed Specification (GTFS) and other relevant data, this research aims to\ndemonstrate how LLMs can potentially improve resource allocation, elevate\npassenger satisfaction, and inform data-driven decision-making in transit\noperations. A comparative analysis of different ChatGPT models was conducted to\nassess their ability to understand transportation information, retrieve\nrelevant data, and provide comprehensive responses. 
Findings from this study\nsuggest that while LLMs hold immense promise for public transit, careful\nengineering and fine-tuning are essential to realizing their full potential.\nSan Antonio serves as a case study to inform the development of LLM-powered\ntransit systems in other urban environments.\n","authors":["Ramya Jonnala","Gongbo Liang","Jeong Yang","Izzat Alsmadi"],"pdf_url":"https://arxiv.org/pdf/2501.03904v1.pdf","comment":"This work is accepted to AAAI 2025 Workshop on AI for Urban Planning.\n arXiv admin note: substantial text overlap with arXiv:2407.11003"},{"id":"http://arxiv.org/abs/2412.06866v3","updated":"2025-01-07T16:16:49Z","published":"2024-12-09T09:31:58Z","title":"LMS-AutoTSF: Learnable Multi-Scale Decomposition and Integrated\n Autocorrelation for Time Series Forecasting","summary":" Time series forecasting is an important challenge with significant\napplications in areas such as weather prediction, stock market analysis,\nscientific simulations and industrial process analysis. In this work, we\nintroduce LMS-AutoTSF, a novel time series forecasting architecture that\nincorporates autocorrelation while leveraging dual encoders operating at\nmultiple scales. Unlike models that rely on predefined trend and seasonal\ncomponents, LMS-AutoTSF employs two separate encoders per scale: one focusing\non low-pass filtering to capture trends and the other utilizing high-pass\nfiltering to model seasonal variations. These filters are learnable, allowing\nthe model to dynamically adapt and isolate trend and seasonal components\ndirectly in the frequency domain. A key innovation in our approach is the\nintegration of autocorrelation, achieved by computing lagged differences in\ntime steps, which enables the model to capture dependencies across time more\neffectively. Each encoder processes the input through fully connected layers to\nhandle temporal and channel interactions. 
By combining frequency-domain\nfiltering, autocorrelation-based temporal modeling, and channel-wise\ntransformations, LMS-AutoTSF not only accurately captures long-term\ndependencies and fine-grained patterns but also operates more efficiently\ncompared to other state-of-the-art methods. Its lightweight design ensures\nfaster processing while maintaining high precision in forecasting across\ndiverse time horizons. The source code is publicly available at\n\\url{http://github.com/mribrahim/LMS-TSF}\n","authors":["Ibrahim Delibasoglu","Sanjay Chakraborty","Fredrik Heintz"],"pdf_url":"https://arxiv.org/pdf/2412.06866v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19519v2","updated":"2025-01-07T16:13:50Z","published":"2024-05-29T20:56:52Z","title":"Two-Layer Retrieval-Augmented Generation Framework for Low-Resource\n Medical Question Answering Using Reddit Data: Proof-of-Concept Study","summary":" The increasing use of social media to share lived and living experiences of\nsubstance use presents a unique opportunity to obtain information on side\neffects, use patterns, and opinions on novel psychoactive substances. However,\ndue to the large volume of data, obtaining useful insights through natural\nlanguage processing technologies such as large language models is challenging.\nThis paper aims to develop a retrieval-augmented generation (RAG) architecture\nfor medical question answering pertaining to clinicians' queries on emerging\nissues associated with health-related topics, using user-generated medical\ninformation on social media. We proposed a two-layer RAG framework for\nquery-focused answer generation and evaluated a proof of concept for the\nframework in the context of query-focused summary generation from social media\nforums, focusing on emerging drug-related information. 
Our modular framework\ngenerates individual summaries followed by an aggregated summary to answer\nmedical queries from large amounts of user-generated social media data in an\nefficient manner. We compared the performance of a quantized large language\nmodel (Nous-Hermes-2-7B-DPO), deployable in low-resource settings, with GPT-4.\nFor this proof-of-concept study, we used user-generated data from Reddit to\nanswer clinicians' questions on the use of xylazine and ketamine. Our framework\nachieves comparable median scores in terms of relevance, length, hallucination,\ncoverage, and coherence when evaluated using GPT-4 and Nous-Hermes-2-7B-DPO,\nevaluated for 20 queries with 76 samples. There was no statistically\nsignificant difference between the two for coverage, coherence, relevance,\nlength, and hallucination. A statistically significant difference was noted for\nthe Coleman-Liau Index. Our RAG framework can effectively answer medical\nquestions about targeted topics and can be deployed in resource-constrained\nsettings.\n","authors":["Sudeshna Das","Yao Ge","Yuting Guo","Swati Rajwal","JaMor Hairston","Jeanne Powell","Drew Walker","Snigdha Peddireddy","Sahithi Lakamana","Selen Bozkurt","Matthew Reyna","Reza Sameni","Yunyu Xiao","Sangmi Kim","Rasheeta Chandler","Natalie Hernandez","Danielle Mowery","Rachel Wightman","Jennifer Love","Anthony Spadaro","Jeanmarie Perrone","Abeed Sarker"],"pdf_url":"https://arxiv.org/pdf/2405.19519v2.pdf","comment":"Published in JMIR: https://www.jmir.org/2025/1/e66220"},{"id":"http://arxiv.org/abs/2501.03902v1","updated":"2025-01-07T16:10:09Z","published":"2025-01-07T16:10:09Z","title":"Explainable Reinforcement Learning via Temporal Policy Decomposition","summary":" We investigate the explainability of Reinforcement Learning (RL) policies\nfrom a temporal perspective, focusing on the sequence of future outcomes\nassociated with individual actions. 
In RL, value functions compress information\nabout rewards collected across multiple trajectories and over an infinite\nhorizon, allowing a compact form of knowledge representation. However, this\ncompression obscures the temporal details inherent in sequential\ndecision-making, presenting a key challenge for interpretability. We present\nTemporal Policy Decomposition (TPD), a novel explainability approach that\nexplains individual RL actions in terms of their Expected Future Outcome (EFO).\nThese explanations decompose generalized value functions into a sequence of\nEFOs, one for each time step up to a prediction horizon of interest, revealing\ninsights into when specific outcomes are expected to occur. We leverage\nfixed-horizon temporal difference learning to devise an off-policy method for\nlearning EFOs for both optimal and suboptimal actions, enabling contrastive\nexplanations consisting of EFOs for different state-action pairs. Our\nexperiments demonstrate that TPD generates accurate explanations that (i)\nclarify the policy's future strategy and anticipated trajectory for a given\naction and (ii) improve understanding of the reward composition, facilitating\nfine-tuning of the reward function to align with human expectations.\n","authors":["Franco Ruggeri","Alessio Russo","Rafia Inam","Karl Henrik Johansson"],"pdf_url":"https://arxiv.org/pdf/2501.03902v1.pdf","comment":"21 pages, 4 figures"},{"id":"http://arxiv.org/abs/2501.03895v1","updated":"2025-01-07T16:03:14Z","published":"2025-01-07T16:03:14Z","title":"LLaVA-Mini: Efficient Image and Video Large Multimodal Models with One\n Vision Token","summary":" The advent of real-time large multimodal models (LMMs) like GPT-4o has\nsparked considerable interest in efficient LMMs. 
LMM frameworks typically\nencode visual inputs into vision tokens (continuous representations) and\nintegrate them and textual instructions into the context of large language\nmodels (LLMs), where large-scale parameters and numerous context tokens\n(predominantly vision tokens) result in substantial computational overhead.\nPrevious efforts towards efficient LMMs always focus on replacing the LLM\nbackbone with smaller models, while neglecting the crucial issue of token\nquantity. In this paper, we introduce LLaVA-Mini, an efficient LMM with minimal\nvision tokens. To achieve a high compression ratio of vision tokens while\npreserving visual information, we first analyze how LMMs understand vision\ntokens and find that most vision tokens only play a crucial role in the early\nlayers of LLM backbone, where they mainly fuse visual information into text\ntokens. Building on this finding, LLaVA-Mini introduces modality pre-fusion to\nfuse visual information into text tokens in advance, thereby facilitating the\nextreme compression of vision tokens fed to LLM backbone into one token.\nLLaVA-Mini is a unified large multimodal model that can support the\nunderstanding of images, high-resolution images, and videos in an efficient\nmanner. Experiments across 11 image-based and 7 video-based benchmarks\ndemonstrate that LLaVA-Mini outperforms LLaVA-v1.5 with just 1 vision token\ninstead of 576. 
Efficiency analyses reveal that LLaVA-Mini can reduce FLOPs by\n77%, deliver low-latency responses within 40 milliseconds, and process over\n10,000 frames of video on the GPU hardware with 24GB of memory.\n","authors":["Shaolei Zhang","Qingkai Fang","Zhe Yang","Yang Feng"],"pdf_url":"https://arxiv.org/pdf/2501.03895v1.pdf","comment":"Code: https://github.com/ictnlp/LLaVA-Mini; Model:\n https://huggingface.co/ICTNLP/llava-mini-llama-3.1-8b"},{"id":"http://arxiv.org/abs/2408.11876v2","updated":"2025-01-07T16:01:15Z","published":"2024-08-20T13:19:06Z","title":"From Glucose Patterns to Health Outcomes: A Generalizable Foundation\n Model for Continuous Glucose Monitor Data Analysis","summary":" Recent advances in SSL enabled novel medical AI models, known as foundation\nmodels, offer great potential for better characterizing health from diverse\nbiomedical data. CGM provides rich, temporal data on glycemic patterns, but its\nfull potential for predicting broader health outcomes remains underutilized.\nHere, we present GluFormer, a generative foundation model for CGM data that\nlearns nuanced glycemic patterns and translates them into predictive\nrepresentations of metabolic health. Trained on over 10 million CGM\nmeasurements from 10,812 adults, primarily without diabetes, GluFormer uses\nautoregressive token prediction to capture longitudinal glucose dynamics. We\nshow that GluFormer generalizes to 19 external cohorts (n=6,044) spanning\ndifferent ethnicities and ages, 5 countries, 8 CGM devices, and diverse\npathophysiological states. GluFormers representations exceed the performance of\ncurrent CGM metrics, such as the Glucose Management Indicator (GMI), for\nforecasting clinical measures. 
In a longitudinal study of 580 adults with CGM\ndata and 12-year follow-up, GluFormer identifies individuals at elevated risk\nof developing diabetes more effectively than blood HbA1C%, capturing 66% of all\nnew-onset diabetes diagnoses in the top quartile versus 7% in the bottom\nquartile. Similarly, 69% of cardiovascular-death events occurred in the top\nquartile with none in the bottom quartile, demonstrating powerful risk\nstratification beyond traditional glycemic metrics. We also show that CGM\nrepresentations from pre-intervention periods in Randomized Clinical Trials\noutperform other methods in predicting primary and secondary outcomes. When\nintegrating dietary data into GluFormer, we show that the multi-modal version\nof the model can accurately generate CGM data based on dietary intake data,\nsimulate outcomes of dietary interventions, and predict individual responses to\nspecific foods.\n","authors":["Guy Lutsker","Gal Sapir","Smadar Shilo","Jordi Merino","Anastasia Godneva","Jerry R Greenfield","Dorit Samocha-Bonet","Raja Dhir","Francisco Gude","Shie Mannor","Eli Meirom","Gal Chechik","Hagai Rossman","Eran Segal"],"pdf_url":"https://arxiv.org/pdf/2408.11876v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03888v1","updated":"2025-01-07T15:51:49Z","published":"2025-01-07T15:51:49Z","title":"Neural DNF-MT: A Neuro-symbolic Approach for Learning Interpretable and\n Editable Policies","summary":" Although deep reinforcement learning has been shown to be effective, the\nmodel's black-box nature presents barriers to direct policy interpretation. To\naddress this problem, we propose a neuro-symbolic approach called neural DNF-MT\nfor end-to-end policy learning. The differentiable nature of the neural DNF-MT\nmodel enables the use of deep actor-critic algorithms for training. 
At the same\ntime, its architecture is designed so that trained models can be directly\ntranslated into interpretable policies expressed as standard (bivalent or\nprobabilistic) logic programs. Moreover, additional layers can be included to\nextract abstract features from complex observations, acting as a form of\npredicate invention. The logic representations are highly interpretable, and we\nshow how the bivalent representations of deterministic policies can be edited\nand incorporated back into a neural model, facilitating manual intervention and\nadaptation of learned policies. We evaluate our approach on a range of tasks\nrequiring learning deterministic or stochastic behaviours from various forms of\nobservations. Our empirical results show that our neural DNF-MT model performs\nat the level of competing black-box methods whilst providing interpretable\npolicies.\n","authors":["Kexin Gu Baugh","Luke Dickens","Alessandra Russo"],"pdf_url":"https://arxiv.org/pdf/2501.03888v1.pdf","comment":"AAMAS 2025"},{"id":"http://arxiv.org/abs/2410.11463v2","updated":"2025-01-07T15:48:15Z","published":"2024-10-15T10:10:33Z","title":"Advanced Persistent Threats (APT) Attribution Using Deep Reinforcement\n Learning","summary":" The development of the DRL model for malware attribution involved extensive\nresearch, iterative coding, and numerous adjustments based on the insights\ngathered from predecessor models and contemporary research papers. This\npreparatory work was essential to establish a robust foundation for the model,\nensuring it could adapt and respond effectively to the dynamic nature of\nmalware threats. Initially, the model struggled with low accuracy levels, but\nthrough persistent adjustments to its architecture and learning algorithms,\naccuracy improved dramatically from about 7 percent to over 73 percent in early\niterations. 
By the end of the training, the model consistently reached accuracy\nlevels near 98 percent, demonstrating its strong capability to accurately\nrecognise and attribute malware activities. This upward trajectory in training\naccuracy is graphically represented in the Figure, which vividly illustrates\nthe model maturation and increasing proficiency over time.\n","authors":["Animesh Singh Basnet","Mohamed Chahine Ghanem","Dipo Dunsin","Wiktor Sowinski-Mydlarz"],"pdf_url":"https://arxiv.org/pdf/2410.11463v2.pdf","comment":"21 Pages"},{"id":"http://arxiv.org/abs/2405.03732v3","updated":"2025-01-07T15:46:25Z","published":"2024-05-06T10:53:13Z","title":"Deep Learning-based Accelerated MR Cholangiopancreatography without\n Fully-sampled Data","summary":" The purpose of this study was to accelerate MR cholangiopancreatography\n(MRCP) acquisitions using deep learning-based (DL) reconstruction at 3T and\n0.55T. A total of 35 healthy volunteers underwent conventional two-fold\naccelerated MRCP scans at field strengths of 3T and 0.55T. We trained DL\nreconstructions using two different training strategies, supervised (SV) and\nself-supervised (SSV), with retrospectively six-fold undersampled data obtained\nat 3T. We then evaluated the DL reconstructions against standard techniques,\nparallel imaging (PI) and compressed sensing (CS), focusing on peak\nsignal-to-noise ratio (PSNR) and structural similarity (SSIM) as metrics. We\nalso tested DL reconstructions with prospectively accelerated acquisitions and\nevaluated their robustness when changing fields strengths from 3T to 0.55T. DL\nreconstructions demonstrated a reduction in average acquisition time from\n599/542 to 255/180 seconds for MRCP at 3T/0.55T. In both retrospective and\nprospective undersampling, PSNR and SSIM of DL reconstructions were higher than\nthose of PI and CS. 
At the same time, DL reconstructions preserved the image\nquality of undersampled data, including sharpness and the visibility of\nhepatobiliary ducts. In addition, both DL approaches produced high-quality\nreconstructions at 0.55T. In summary, DL reconstructions trained for highly\naccelerated MRCP enabled a reduction in acquisition time by a factor of 2.4/3.0\nat 3T/0.55T while maintaining the image quality of conventional acquisitions.\n","authors":["Jinho Kim","Marcel Dominik Nickel","Florian Knoll"],"pdf_url":"https://arxiv.org/pdf/2405.03732v3.pdf","comment":"19 pages, 4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2501.03879v1","updated":"2025-01-07T15:42:32Z","published":"2025-01-07T15:42:32Z","title":"CL3DOR: Contrastive Learning for 3D Large Multimodal Models via Odds\n Ratio on High-Resolution Point Clouds","summary":" Recent research has demonstrated that Large Language Models (LLMs) are not\nlimited to text-only tasks but can also function as multimodal models across\nvarious modalities, including audio, images, and videos. In particular,\nresearch on 3D Large Multimodal Models (3D LMMs) is making notable strides,\ndriven by the potential of processing higher-dimensional data like point\nclouds. However, upon closer examination, we find that the visual and textual\ncontent within each sample of existing training datasets lacks both high\ninformational granularity and clarity, which serve as a bottleneck for precise\ncross-modal understanding. To address these issues, we propose CL3DOR,\nContrastive Learning for 3D large multimodal models via Odds ratio on\nhigh-Resolution point clouds, designed to ensure greater specificity and\nclarity in both visual and textual content. Specifically, we increase the\ndensity of point clouds per object and construct informative hard negative\nresponses in the training dataset to penalize unwanted responses. 
To leverage\nhard negative responses, we incorporate the odds ratio as an auxiliary term for\ncontrastive learning into the conventional language modeling loss. CL3DOR\nachieves state-of-the-art performance in 3D scene understanding and reasoning\nbenchmarks. Additionally, we demonstrate the effectiveness of CL3DOR's key\ncomponents through extensive experiments.\n","authors":["Keonwoo Kim","Yeongjae Cho","Taebaek Hwang","Minsoo Jo","Sangdo Han"],"pdf_url":"https://arxiv.org/pdf/2501.03879v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08514v2","updated":"2025-01-07T15:37:10Z","published":"2024-09-13T03:25:34Z","title":"Apollo: Band-sequence Modeling for High-Quality Audio Restoration","summary":" Audio restoration has become increasingly significant in modern society, not\nonly due to the demand for high-quality auditory experiences enabled by\nadvanced playback devices, but also because the growing capabilities of\ngenerative audio models necessitate high-fidelity audio. Typically, audio\nrestoration is defined as a task of predicting undistorted audio from damaged\ninput, often trained using a GAN framework to balance perception and\ndistortion. Since audio degradation is primarily concentrated in mid- and\nhigh-frequency ranges, especially due to codecs, a key challenge lies in\ndesigning a generator capable of preserving low-frequency information while\naccurately reconstructing high-quality mid- and high-frequency content.\nInspired by recent advancements in high-sample-rate music separation, speech\nenhancement, and audio codec models, we propose Apollo, a generative model\ndesigned for high-sample-rate audio restoration. 
Apollo employs an explicit\nfrequency band split module to model the relationships between different\nfrequency bands, allowing for more coherent and higher-quality restored audio.\nEvaluated on the MUSDB18-HQ and MoisesDB datasets, Apollo consistently\noutperforms existing SR-GAN models across various bit rates and music genres,\nparticularly excelling in complex scenarios involving mixtures of multiple\ninstruments and vocals. Apollo significantly improves music restoration quality\nwhile maintaining computational efficiency. The source code for Apollo is\npublicly available at https://github.com/JusperLee/Apollo.\n","authors":["Kai Li","Yi Luo"],"pdf_url":"https://arxiv.org/pdf/2409.08514v2.pdf","comment":"Accepted by ICASSP 2025, Demo Page: https://cslikai.cn/Apollo"},{"id":"http://arxiv.org/abs/2412.14841v2","updated":"2025-01-07T15:30:56Z","published":"2024-12-19T13:34:14Z","title":"Helping LLMs Improve Code Generation Using Feedback from Testing and\n Static Analysis","summary":" Large Language Models (LLMs) are one of the most promising developments in\nthe field of artificial intelligence, and the software engineering community\nhas readily noticed their potential role in the software development\nlife-cycle. Developers routinely ask LLMs to generate code snippets, increasing\nproductivity but also potentially introducing ownership, privacy, correctness,\nand security issues. Previous work highlighted how code generated by mainstream\ncommercial LLMs is often not safe, containing vulnerabilities, bugs, and code\nsmells. In this paper, we present a framework that leverages testing and static\nanalysis to assess the quality, and guide the self-improvement, of code\ngenerated by general-purpose, open-source LLMs.\n First, we ask LLMs to generate C code to solve a number of programming tasks.\nThen we employ ground-truth tests to assess the (in)correctness of the\ngenerated code, and a static analysis tool to detect potential safety\nvulnerabilities. 
Next, we assess the models ability to evaluate the generated\ncode, by asking them to detect errors and vulnerabilities. Finally, we test the\nmodels ability to fix the generated code, providing the reports produced during\nthe static analysis and incorrectness evaluation phases as feedback.\n Our results show that models often produce incorrect code, and that the\ngenerated code can include safety issues. Moreover, they perform very poorly at\ndetecting either issue. On the positive side, we observe a substantial ability\nto fix flawed code when provided with information about failed tests or\npotential vulnerabilities, indicating a promising avenue for improving the\nsafety of LLM-based code generation tools.\n","authors":["Greta Dolcetti","Vincenzo Arceri","Eleonora Iotti","Sergio Maffeis","Agostino Cortesi","Enea Zaffanella"],"pdf_url":"https://arxiv.org/pdf/2412.14841v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.19155v3","updated":"2025-01-07T15:30:02Z","published":"2024-10-24T20:49:22Z","title":"Lived Experience Not Found: LLMs Struggle to Align with Experts on\n Addressing Adverse Drug Reactions from Psychiatric Medication Use","summary":" Adverse Drug Reactions (ADRs) from psychiatric medications are the leading\ncause of hospitalizations among mental health patients. With healthcare systems\nand online communities facing limitations in resolving ADR-related issues,\nLarge Language Models (LLMs) have the potential to fill this gap. Despite the\nincreasing capabilities of LLMs, past research has not explored their\ncapabilities in detecting ADRs related to psychiatric medications or in\nproviding effective harm reduction strategies. To address this, we introduce\nthe Psych-ADR benchmark and the Adverse Drug Reaction Response Assessment\n(ADRA) framework to systematically evaluate LLM performance in detecting ADR\nexpressions and delivering expert-aligned mitigation strategies. 
Our analyses\nshow that LLMs struggle with understanding the nuances of ADRs and\ndifferentiating between types of ADRs. While LLMs align with experts in terms\nof expressed emotions and tone of the text, their responses are more complex,\nharder to read, and only 70.86% aligned with expert strategies. Furthermore,\nthey provide less actionable advice by a margin of 12.32% on average. Our work\nprovides a comprehensive benchmark and evaluation framework for assessing LLMs\nin strategy-driven tasks within high-risk domains.\n","authors":["Mohit Chandra","Siddharth Sriraman","Gaurav Verma","Harneet Singh Khanuja","Jose Suarez Campayo","Zihang Li","Michael L. Birnbaum","Munmun De Choudhury"],"pdf_url":"https://arxiv.org/pdf/2410.19155v3.pdf","comment":"30 pages, 8 figures, 16 tables"},{"id":"http://arxiv.org/abs/2410.13850v3","updated":"2025-01-07T15:28:09Z","published":"2024-10-17T17:59:02Z","title":"Influence Functions for Scalable Data Attribution in Diffusion Models","summary":" Diffusion models have led to significant advancements in generative\nmodelling. Yet their widespread adoption poses challenges regarding data\nattribution and interpretability. In this paper, we aim to help address such\nchallenges in diffusion models by developing an influence functions framework.\nInfluence function-based data attribution methods approximate how a model's\noutput would have changed if some training data were removed. In supervised\nlearning, this is usually used for predicting how the loss on a particular\nexample would change. For diffusion models, we focus on predicting the change\nin the probability of generating a particular example via several proxy\nmeasurements. We show how to formulate influence functions for such quantities\nand how previously proposed methods can be interpreted as particular design\nchoices in our framework. 
To ensure scalability of the Hessian computations in\ninfluence functions, we systematically develop K-FAC approximations based on\ngeneralised Gauss-Newton matrices specifically tailored to diffusion models. We\nrecast previously proposed methods as specific design choices in our framework\nand show that our recommended method outperforms previous data attribution\napproaches on common evaluations, such as the Linear Data-modelling Score (LDS)\nor retraining without top influences, without the need for method-specific\nhyperparameter tuning.\n","authors":["Bruno Mlodozeniec","Runa Eschenhagen","Juhan Bae","Alexander Immer","David Krueger","Richard Turner"],"pdf_url":"https://arxiv.org/pdf/2410.13850v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03038v2","updated":"2025-01-07T15:13:41Z","published":"2025-01-06T14:26:00Z","title":"Piano Transcription by Hierarchical Language Modeling with Pretrained\n Roll-based Encoders","summary":" Automatic Music Transcription (AMT), aiming to get musical notes from raw\naudio, typically uses frame-level systems with piano-roll outputs or language\nmodel (LM)-based systems with note-level predictions. However, frame-level\nsystems require manual thresholding, while the LM-based systems struggle with\nlong sequences. In this paper, we propose a hybrid method combining pre-trained\nroll-based encoders with an LM decoder to leverage the strengths of both\nmethods. Besides, our approach employs a hierarchical prediction strategy,\nfirst predicting onset and pitch, then velocity, and finally offset. The\nhierarchical prediction strategy reduces computational costs by breaking down\nlong sequences into different hierarchies. 
Evaluated on two benchmark\nroll-based encoders, our method outperforms traditional piano-roll outputs 0.01\nand 0.022 in onset-offset-velocity F1 score, demonstrating its potential as a\nperformance-enhancing plug-in for arbitrary roll-based music transcription\nencoder.\n","authors":["Dichucheng Li","Yongyi Zang","Qiuqiang Kong"],"pdf_url":"https://arxiv.org/pdf/2501.03038v2.pdf","comment":"Accepted by ICASSP 2025"},{"id":"http://arxiv.org/abs/2501.03847v1","updated":"2025-01-07T15:01:58Z","published":"2025-01-07T15:01:58Z","title":"Diffusion as Shader: 3D-aware Video Diffusion for Versatile Video\n Generation Control","summary":" Diffusion models have demonstrated impressive performance in generating\nhigh-quality videos from text prompts or images. However, precise control over\nthe video generation process, such as camera manipulation or content editing,\nremains a significant challenge. Existing methods for controlled video\ngeneration are typically limited to a single control type, lacking the\nflexibility to handle diverse control demands. In this paper, we introduce\nDiffusion as Shader (DaS), a novel approach that supports multiple video\ncontrol tasks within a unified architecture. Our key insight is that achieving\nversatile video control necessitates leveraging 3D control signals, as videos\nare fundamentally 2D renderings of dynamic 3D content. Unlike prior methods\nlimited to 2D control signals, DaS leverages 3D tracking videos as control\ninputs, making the video diffusion process inherently 3D-aware. This innovation\nallows DaS to achieve a wide range of video controls by simply manipulating the\n3D tracking videos. A further advantage of using 3D tracking videos is their\nability to effectively link frames, significantly enhancing the temporal\nconsistency of the generated videos. 
With just 3 days of fine-tuning on 8 H800\nGPUs using less than 10k videos, DaS demonstrates strong control capabilities\nacross diverse tasks, including mesh-to-video generation, camera control,\nmotion transfer, and object manipulation.\n","authors":["Zekai Gu","Rui Yan","Jiahao Lu","Peng Li","Zhiyang Dou","Chenyang Si","Zhen Dong","Qifeng Liu","Cheng Lin","Ziwei Liu","Wenping Wang","Yuan Liu"],"pdf_url":"https://arxiv.org/pdf/2501.03847v1.pdf","comment":"Project page: https://igl-hkust.github.io/das/ Codes:\n https://github.com/IGL-HKUST/DiffusionAsShader"},{"id":"http://arxiv.org/abs/2409.16670v2","updated":"2025-01-07T15:00:20Z","published":"2024-09-25T06:57:42Z","title":"GraphLoRA: Structure-Aware Contrastive Low-Rank Adaptation for\n Cross-Graph Transfer Learning","summary":" Graph Neural Networks (GNNs) have demonstrated remarkable proficiency in\nhandling a range of graph analytical tasks across various domains, such as\ne-commerce and social networks. Despite their versatility, GNNs face\nsignificant challenges in transferability, limiting their utility in real-world\napplications. Existing research in GNN transfer learning overlooks\ndiscrepancies in distribution among various graph datasets, facing challenges\nwhen transferring across different distributions. How to effectively adopt a\nwell-trained GNN to new graphs with varying feature and structural\ndistributions remains an under-explored problem. Taking inspiration from the\nsuccess of Low-Rank Adaptation (LoRA) in adapting large language models to\nvarious domains, we propose GraphLoRA, an effective and parameter-efficient\nmethod for transferring well-trained GNNs to diverse graph domains.\nSpecifically, we first propose a Structure-aware Maximum Mean Discrepancy\n(SMMD) to align divergent node feature distributions across source and target\ngraphs. 
Moreover, we introduce low-rank adaptation by injecting a small\ntrainable GNN alongside the pre-trained one, effectively bridging structural\ndistribution gaps while mitigating the catastrophic forgetting. Additionally, a\nstructure-aware regularization objective is proposed to enhance the\nadaptability of the pre-trained GNN to target graph with scarce supervision\nlabels. Extensive experiments on eight real-world datasets demonstrate the\neffectiveness of GraphLoRA against fourteen baselines by tuning only 20% of\nparameters, even across disparate graph domains. The code is available at\nhttps://github.com/AllminerLab/GraphLoRA.\n","authors":["Zhe-Rui Yang","Jindong Han","Chang-Dong Wang","Hao Liu"],"pdf_url":"https://arxiv.org/pdf/2409.16670v2.pdf","comment":"Accepted by KDD2025"},{"id":"http://arxiv.org/abs/2410.15460v3","updated":"2025-01-07T14:56:42Z","published":"2024-10-20T18:18:23Z","title":"Hallucination Detox: Sensitivity Dropout (SenD) for Large Language Model\n Training","summary":" As large language models (LLMs) are increasingly deployed across various\nindustries, concerns regarding their reliability, particularly due to\nhallucinations - outputs that are factually inaccurate or irrelevant to user\ninput - have grown. Our research investigates the relationship between the\ntraining process and the emergence of hallucinations to address a key gap in\nexisting research that focuses primarily on post hoc detection and mitigation\nstrategies. Using models from the Pythia suite (70M - 12B parameters) and\nseveral hallucination detection metrics, we analyze hallucination trends\nthroughout training and explore LLM internal dynamics. We introduce Sensitivity\nDropout (SenD), a novel training protocol designed to mitigate hallucinations\nby reducing variance during training. SenD achieves this by deterministically\ndropping embedding indices with significant variability, referred to as\nSensitive Embedding Indices. 
In addition, we develop an unsupervised\nhallucination detection metric, Efficient EigenScore (EES), which approximates\nthe traditional EigenScore at 2x speed. This efficient metric is integrated\ninto our protocol, allowing SenD to be both computationally scalable and\neffective at reducing hallucinations. Our empirical evaluation demonstrates\nthat our approach improves LLM reliability at test time by up to 40% compared\nto normal training while also providing an efficient method to improve factual\naccuracy when adapting LLMs to Wikipedia, Medical, and LegalBench domains.\n","authors":["Shahrad Mohammadzadeh","Juan David Guerra","Marco Bonizzato","Reihaneh Rabbany","Golnoosh Farnadi"],"pdf_url":"https://arxiv.org/pdf/2410.15460v3.pdf","comment":"23 pages, 15 figures, under review at ICLR, accepted to Safe\n Generative AI Workshop @ NeurIPS 2024, resubmitting to change name to\n appropriate name"},{"id":"http://arxiv.org/abs/2501.03836v1","updated":"2025-01-07T14:45:39Z","published":"2025-01-07T14:45:39Z","title":"SCC-YOLO: An Improved Object Detector for Assisting in Brain Tumor\n Diagnosis","summary":" Brain tumors can result in neurological dysfunction, alterations in cognitive\nand psychological states, increased intracranial pressure, and the occurrence\nof seizures, thereby presenting a substantial risk to human life and health.\nThe You Only Look Once(YOLO) series models have demonstrated superior accuracy\nin object detection for medical imaging. In this paper, we develop a novel\nSCC-YOLO architecture by integrating the SCConv attention mechanism into\nYOLOv9. The SCConv module reconstructs an efficient convolutional module by\nreducing spatial and channel redundancy among features, thereby enhancing the\nlearning of image features. 
We investigate the impact of integrating different\nattention mechanisms with the YOLOv9 model on brain tumor image detection using\nboth the Br35H dataset and our self-made dataset (Brain_Tumor_Dataset).\nExperimental results show that on the Br35H dataset, SCC-YOLO achieved a 0.3%\nimprovement in mAP50 compared to YOLOv9, while on our self-made dataset,\nSCC-YOLO exhibited a 0.5% improvement over YOLOv9. SCC-YOLO has reached\nstate-of-the-art performance in brain tumor detection. Source code is available\nat: https://jihulab.com/healthcare-information-studio/SCC-YOLO/-/tree/master\n","authors":["Runci Bai"],"pdf_url":"https://arxiv.org/pdf/2501.03836v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03835v1","updated":"2025-01-07T14:45:30Z","published":"2025-01-07T14:45:30Z","title":"TACLR: A Scalable and Efficient Retrieval-based Method for Industrial\n Product Attribute Value Identification","summary":" Product Attribute Value Identification (PAVI) involves identifying attribute\nvalues from product profiles, a key task for improving product search,\nrecommendations, and business analytics on e-commerce platforms. However,\nexisting PAVI methods face critical challenges, such as inferring implicit\nvalues, handling out-of-distribution (OOD) values, and producing normalized\noutputs. To address these limitations, we introduce Taxonomy-Aware Contrastive\nLearning Retrieval (TACLR), the first retrieval-based method for PAVI. TACLR\nformulates PAVI as an information retrieval task by encoding product profiles\nand candidate values into embeddings and retrieving values based on their\nsimilarity to the item embedding. It leverages contrastive training with\ntaxonomy-aware hard negative sampling and employs adaptive inference with\ndynamic thresholds. 
TACLR offers three key advantages: (1) it effectively\nhandles implicit and OOD values while producing normalized outputs; (2) it\nscales to thousands of categories, tens of thousands of attributes, and\nmillions of values; and (3) it supports efficient inference for high-load\nindustrial scenarios. Extensive experiments on proprietary and public datasets\nvalidate the effectiveness and efficiency of TACLR. Moreover, it has been\nsuccessfully deployed in a real-world e-commerce platform, processing millions\nof product listings daily while supporting dynamic, large-scale attribute\ntaxonomies.\n","authors":["Yindu Su","Huike Zou","Lin Sun","Ting Zhang","Haiyang Yang","Liyu Chen","David Lo","Qingheng Zhang","Shuguang Han","Jufeng Chen"],"pdf_url":"https://arxiv.org/pdf/2501.03835v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03832v1","updated":"2025-01-07T14:42:38Z","published":"2025-01-07T14:42:38Z","title":"Three-dimensional attention Transformer for state evaluation in\n real-time strategy games","summary":" Situation assessment in Real-Time Strategy (RTS) games is crucial for\nunderstanding decision-making in complex adversarial environments. However,\nexisting methods remain limited in processing multi-dimensional feature\ninformation and temporal dependencies. 
Here we propose a tri-dimensional\nSpace-Time-Feature Transformer (TSTF Transformer) architecture, which\nefficiently models battlefield situations through three independent but\ncascaded modules: spatial attention, temporal attention, and feature attention.\nOn a dataset comprising 3,150 adversarial experiments, the 8-layer TSTF\nTransformer demonstrates superior performance: achieving 58.7% accuracy in the\nearly game (~4% progress), significantly outperforming the conventional\nTimesformer's 41.8%; reaching 97.6% accuracy in the mid-game (~40% progress)\nwhile maintaining low performance variation (standard deviation 0.114).\nMeanwhile, this architecture requires fewer parameters (4.75M) compared to the\nbaseline model (5.54M). Our study not only provides new insights into situation\nassessment in RTS games but also presents an innovative paradigm for\nTransformer-based multi-dimensional temporal modeling.\n","authors":["Yanqing Ye","Weilong Yang","Kai Qiu","Jie Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.03832v1.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2501.03825v1","updated":"2025-01-07T14:37:14Z","published":"2025-01-07T14:37:14Z","title":"Deep Sylvester Posterior Inference for Adaptive Compressed Sensing in\n Ultrasound Imaging","summary":" Ultrasound images are commonly formed by sequential acquisition of\nbeam-steered scan-lines. Minimizing the number of required scan-lines can\nsignificantly enhance frame rate, field of view, energy efficiency, and data\ntransfer speeds. Existing approaches typically use static subsampling schemes\nin combination with sparsity-based or, more recently, deep-learning-based\nrecovery. In this work, we introduce an adaptive subsampling method that\nmaximizes intrinsic information gain in-situ, employing a Sylvester Normalizing\nFlow encoder to infer an approximate Bayesian posterior under partial\nobservation in real-time. 
Using the Bayesian posterior and a deep generative\nmodel for future observations, we determine the subsampling scheme that\nmaximizes the mutual information between the subsampled observations and the\nnext frame of the video. We evaluate our approach using the EchoNet cardiac\nultrasound video dataset and demonstrate that our active sampling method\noutperforms competitive baselines, including uniform and variable-density\nrandom sampling, as well as equidistantly spaced scan-lines, improving mean\nabsolute reconstruction error by 15%. Moreover, posterior inference and the\nsampling scheme generation are performed in just 0.015 seconds (66Hz), making\nit fast enough for real-time 2D ultrasound imaging applications.\n","authors":["Simon W. Penninga","Hans van Gorp","Ruud J. G. van Sloun"],"pdf_url":"https://arxiv.org/pdf/2501.03825v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03824v1","updated":"2025-01-07T14:36:33Z","published":"2025-01-07T14:36:33Z","title":"Online Reinforcement Learning-Based Dynamic Adaptive Evaluation Function\n for Real-Time Strategy Tasks","summary":" Effective evaluation of real-time strategy tasks requires adaptive mechanisms\nto cope with dynamic and unpredictable environments. This study proposes a\nmethod to improve evaluation functions for real-time responsiveness to\nbattlefield situation changes, utilizing an online reinforcement\nlearning-based dynamic weight adjustment mechanism within the real-time\nstrategy game. Building on traditional static evaluation functions, the method\nemploys gradient descent in online reinforcement learning to update weights\ndynamically, incorporating weight decay techniques to ensure stability.\nAdditionally, the AdamW optimizer is integrated to adjust the learning rate and\ndecay rate of online reinforcement learning in real time, further reducing the\ndependency on manual parameter tuning. 
Round-robin competition experiments\ndemonstrate that this method significantly enhances the application\neffectiveness of the Lanchester combat model evaluation function, Simple\nevaluation function, and Simple Sqrt evaluation function in planning algorithms\nincluding IDABCD, IDRTMinimax, and Portfolio AI. The method achieves a notable\nimprovement in scores, with the enhancement becoming more pronounced as the\nmap size increases. Furthermore, the increase in evaluation function\ncomputation time induced by this method is kept below 6% for all evaluation\nfunctions and planning algorithms. The proposed dynamic adaptive evaluation\nfunction demonstrates a promising approach for real-time strategy task\nevaluation.\n","authors":["Weilong Yang","Jie Zhang","Xunyun Liu","Yanqing Ye"],"pdf_url":"https://arxiv.org/pdf/2501.03824v1.pdf","comment":"22 pages, 9 figures"},{"id":"http://arxiv.org/abs/2407.10486v2","updated":"2025-01-07T14:09:22Z","published":"2024-07-15T07:14:56Z","title":"IDEAL: Leveraging Infinite and Dynamic Characterizations of Large\n Language Models for Query-focused Summarization","summary":" Query-focused summarization (QFS) aims to produce summaries that answer\nparticular questions of interest, enabling greater user control and\npersonalization. Large language models (LLMs) have shown impressive\ncapability in textual understanding through large-scale pretraining,\nwhich implies great potential for extractive snippet generation. In this\npaper, we systematically investigate two indispensable characteristics that\nLLM-based QFS models should harness, Lengthy Document Summarization\nand Efficiently Fine-grained Query-LLM Alignment, respectively.\nCorrespondingly, we propose two modules called Query-aware HyperExpert and\nQuery-focused Infini-attention to access the aforementioned characteristics.\nThese innovations pave the way for broader application and accessibility in the\nfield of QFS technology. 
Extensive experiments conducted on existing QFS\nbenchmarks indicate the effectiveness and generalizability of the proposed\napproach. Our code is publicly available at\nhttps://github.com/DCDmllm/IDEAL_Summary.\n","authors":["Jie Cao","Dian Jiao","Qiang Yan","Wenqiao Zhang","Siliang Tang","Yueting Zhuang"],"pdf_url":"https://arxiv.org/pdf/2407.10486v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03795v1","updated":"2025-01-07T14:01:59Z","published":"2025-01-07T14:01:59Z","title":"Self-Adaptive ERP: Embedding NLP into Petri-Net creation and Model\n Matching","summary":" Enterprise Resource Planning (ERP) consultants play a vital role in\ncustomizing systems to meet specific business needs by processing large amounts\nof data and adapting functionalities. However, the process is\nresource-intensive, time-consuming, and requires continuous adjustments as\nbusiness demands evolve. This research introduces a Self-Adaptive ERP Framework\nthat automates customization using enterprise process models and system usage\nanalysis. It leverages Artificial Intelligence (AI) & Natural Language\nProcessing (NLP) for Petri nets to transform business processes into adaptable\nmodels, addressing both structural and functional matching. The framework,\nbuilt using Design Science Research (DSR) and a Systematic Literature Review\n(SLR), reduces reliance on manual adjustments, improving ERP customization\nefficiency and accuracy while minimizing the need for consultants.\n","authors":["Ahmed Maged","Gamal Kassem"],"pdf_url":"https://arxiv.org/pdf/2501.03795v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.02285v2","updated":"2025-01-07T13:38:34Z","published":"2025-01-04T13:27:18Z","title":"Hyperbolic Contrastive Learning for Hierarchical 3D Point Cloud\n Embedding","summary":" Hyperbolic spaces allow for more efficient modeling of complex, hierarchical\nstructures, which is particularly beneficial in tasks involving multi-modal\ndata. 
Although hyperbolic geometries have been proven effective for\nlanguage-image pre-training, their capabilities to unify language, image, and\n3D Point Cloud modalities are under-explored. We extend the 3D Point Cloud\nmodality in hyperbolic multi-modal contrastive pre-training. Additionally, we\nexplore the entailment, modality gap, and alignment regularizers for learning\nhierarchical 3D embeddings and facilitating the transfer of knowledge from both\nText and Image modalities. These regularizers enable the learning of\nintra-modal hierarchy within each modality and inter-modal hierarchy across\ntext, 2D images, and 3D Point Clouds. Experimental results demonstrate that our\nproposed training strategy yields an outstanding 3D Point Cloud encoder, and\nthe obtained 3D Point Cloud hierarchical embeddings significantly improve\nperformance on various downstream tasks.\n","authors":["Yingjie Liu","Pengyu Zhang","Ziyao He","Mingsong Chen","Xuan Tang","Xian Wei"],"pdf_url":"https://arxiv.org/pdf/2501.02285v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00518v2","updated":"2025-01-07T13:36:12Z","published":"2023-09-30T22:37:28Z","title":"Learning Informative Latent Representation for Quantum State Tomography","summary":" Quantum state tomography (QST) is the process of reconstructing the complete\nstate of a quantum system (mathematically described as a density matrix)\nthrough a series of different measurements. These measurements are performed on\na number of identical copies of the quantum system, with outcomes gathered as\nfrequencies. QST aims to recover the density matrix or the properties of the\nquantum state from the measured frequencies. Although an informationally\ncomplete set of measurements can specify the quantum state accurately in an\nideal scenario with a large number of identical copies, both the measurements\nand identical copies are restricted and imperfect in practical scenarios,\nmaking QST highly ill-posed. 
The conventional QST methods usually assume\naccurate measured frequencies or rely on manually designed regularizers to\nhandle the ill-posed reconstruction problem, suffering from limited\napplications in realistic scenarios. Recent advances in deep neural networks\n(DNN) led to the emergence of deep learning in QST. However, existing DL-based\nQST approaches often employ generic DNN models that are not optimized for\nimperfect conditions of QST. In this paper, we propose a transformer-based\nautoencoder architecture tailored for QST with imperfect measurement data. Our\nmethod leverages a transformer-based encoder to extract an informative latent\nrepresentation (ILR) from imperfect measurement data and employs a decoder to\npredict the quantum states based on the ILR. We anticipate that the\nhigh-dimensional ILR will capture more comprehensive information about the\nquantum states. To achieve this, we conduct pre-training of the encoder using a\npretext task that involves reconstructing high-quality frequencies from\nmeasured frequencies. Extensive simulations and experiments demonstrate the\nremarkable ability of the informative latent representation to deal with\nimperfect measurement data in QST.\n","authors":["Hailan Ma","Zhenhong Sun","Daoyi Dong","Dong Gong"],"pdf_url":"https://arxiv.org/pdf/2310.00518v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.00546v3","updated":"2025-01-07T13:31:01Z","published":"2023-12-31T17:21:02Z","title":"AllSpark: A Multimodal Spatio-Temporal General Intelligence Model with\n Ten Modalities via Language as a Reference Framework","summary":" Leveraging multimodal data is an inherent requirement for comprehending\ngeographic objects. 
However, due to the high heterogeneity in structure and\nsemantics among various spatio-temporal modalities, the joint interpretation of\nmultimodal spatio-temporal data has long been an extremely challenging problem.\nThe primary challenge resides in striking a trade-off between the cohesion and\nautonomy of diverse modalities. This trade-off becomes progressively nonlinear\nas the number of modalities expands. Inspired by the human cognitive system and\nlinguistic philosophy, where perceptual signals from the five senses converge\ninto language, we introduce the Language as Reference Framework (LaRF), a\nfundamental principle for constructing a multimodal unified model. Building\nupon this, we propose AllSpark, a multimodal spatio-temporal general artificial\nintelligence model. Our model integrates ten different modalities into a\nunified framework. To achieve modal cohesion, AllSpark introduces a modal\nbridge and multimodal large language model (LLM) to map diverse modal features\ninto the language feature space. To maintain modality autonomy, AllSpark uses\nmodality-specific encoders to extract the tokens of various spatio-temporal\nmodalities. 
Finally, observing a gap between the model's interpretability and\ndownstream tasks, we designed modality-specific prompts and task heads,\nenhancing the model's generalization capability across specific tasks.\nExperiments indicate that the incorporation of language enables AllSpark to\nexcel in few-shot classification tasks for RGB and point cloud modalities\nwithout additional training, surpassing baseline performance by up to 41.82\\%.\nThe source code is available at https://github.com/GeoX-Lab/AllSpark.\n","authors":["Run Shao","Cheng Yang","Qiujun Li","Qing Zhu","Yongjun Zhang","YanSheng Li","Yu Liu","Yong Tang","Dapeng Liu","Shizhong Yang","Haifeng Li"],"pdf_url":"https://arxiv.org/pdf/2401.00546v3.pdf","comment":"19 pages, 19 tables, 3 figures"},{"id":"http://arxiv.org/abs/2410.19915v2","updated":"2025-01-07T13:14:25Z","published":"2024-10-25T18:09:02Z","title":"AI-Driven Scenarios for Urban Mobility: Quantifying the Role of ODE\n Models and Scenario Planning in Reducing Traffic Congestion","summary":" Urbanization and technological advancements are reshaping urban mobility,\npresenting both challenges and opportunities. This paper investigates how\nArtificial Intelligence (AI)-driven technologies can impact traffic congestion\ndynamics and explores their potential to enhance transportation systems'\nefficiency. Specifically, we assess the role of AI innovations, such as\nautonomous vehicles and intelligent traffic management, in mitigating\ncongestion under varying regulatory frameworks. Autonomous vehicles reduce\ncongestion through optimized traffic flow, real-time route adjustments, and\ndecreased human errors.\n The study employs Ordinary Differential Equations (ODEs) to model the dynamic\nrelationship between AI adoption rates and traffic congestion, capturing\nsystemic feedback loops. 
Quantitative outputs include threshold levels of AI\nadoption needed to achieve significant congestion reduction, while qualitative\ninsights stem from scenario planning exploring regulatory and societal\nconditions. This dual-method approach offers actionable strategies for\npolicymakers to create efficient, sustainable, and equitable urban\ntransportation systems. While safety implications of AI are acknowledged, this\nstudy primarily focuses on congestion reduction dynamics.\n","authors":["Katsiaryna Bahamazava"],"pdf_url":"https://arxiv.org/pdf/2410.19915v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03764v1","updated":"2025-01-07T13:08:54Z","published":"2025-01-07T13:08:54Z","title":"SelectiveFinetuning: Enhancing Transfer Learning in Sleep Staging\n through Selective Domain Alignment","summary":" In practical sleep stage classification, a key challenge is the variability\nof EEG data across different subjects and environments. Differences in\nphysiology, age, health status, and recording conditions can lead to domain\nshifts between data. These domain shifts often result in decreased model\naccuracy and reliability, particularly when the model is applied to new data\nwith characteristics different from those it was originally trained on, which\nis a typical manifestation of negative transfer. To address this, we propose\nSelectiveFinetuning in this paper. Our method utilizes a pretrained Multi\nResolution Convolutional Neural Network (MRCNN) to extract EEG features,\ncapturing the distinctive characteristics of different sleep stages. To\nmitigate the effect of domain shifts, we introduce a domain aligning mechanism\nthat employs Earth Mover Distance (EMD) to evaluate and select source domain\ndata closely matching the target domain. 
By finetuning the model with selective\nsource data, our SelectiveFinetuning enhances the model's performance on a target\ndomain that exhibits domain shifts compared to the data used for training.\nExperimental results show that our method outperforms existing baselines,\noffering greater robustness and adaptability in practical scenarios where data\ndistributions are often unpredictable.\n","authors":["Siyuan Zhao","Chenyu Liu","Yi Ding","Xinliang Zhou"],"pdf_url":"https://arxiv.org/pdf/2501.03764v1.pdf","comment":"Accepted by ICASSP 2025"},{"id":"http://arxiv.org/abs/2410.09453v2","updated":"2025-01-07T13:00:57Z","published":"2024-10-12T09:16:09Z","title":"MMAD: The First-Ever Comprehensive Benchmark for Multimodal Large\n Language Models in Industrial Anomaly Detection","summary":" In the field of industrial inspection, Multimodal Large Language Models\n(MLLMs) have a high potential to renew the paradigms in practical applications\ndue to their robust language capabilities and generalization abilities.\nHowever, despite their impressive problem-solving skills in many domains,\nMLLMs' ability in industrial anomaly detection has not been systematically\nstudied. To bridge this gap, we present MMAD, the first-ever full-spectrum\nMLLMs benchmark in industrial Anomaly Detection. We defined seven key subtasks\nof MLLMs in industrial inspection and designed a novel pipeline to generate the\nMMAD dataset with 39,672 questions for 8,366 industrial images. With MMAD, we\nhave conducted a comprehensive, quantitative evaluation of various\nstate-of-the-art MLLMs. The commercial models performed the best, with the\naverage accuracy of GPT-4o models reaching 74.9%. However, this result falls\nfar short of industrial requirements. Our analysis reveals that current MLLMs\nstill have significant room for improvement in answering questions related to\nindustrial anomalies and defects. 
We further explore two training-free\nperformance enhancement strategies to help models improve in industrial\nscenarios, highlighting their promising potential for future research.\n","authors":["Xi Jiang","Jian Li","Hanqiu Deng","Yong Liu","Bin-Bin Gao","Yifeng Zhou","Jialin Li","Chengjie Wang","Feng Zheng"],"pdf_url":"https://arxiv.org/pdf/2410.09453v2.pdf","comment":"The code and data are available at https://github.com/jam-cc/MMAD"},{"id":"http://arxiv.org/abs/2409.18301v3","updated":"2025-01-07T12:44:48Z","published":"2024-09-26T21:16:51Z","title":"Wavelet-Driven Generalizable Framework for Deepfake Face Forgery\n Detection","summary":" The evolution of digital image manipulation, particularly with the\nadvancement of deep generative models, significantly challenges existing\ndeepfake detection methods, especially when the origin of the deepfake is\nobscure. To tackle the increasing complexity of these forgeries, we propose\n\\textbf{Wavelet-CLIP}, a deepfake detection framework that integrates wavelet\ntransforms with features derived from the ViT-L/14 architecture, pre-trained in\nthe CLIP fashion. Wavelet-CLIP utilizes Wavelet Transforms to deeply analyze\nboth spatial and frequency features from images, thus enhancing the model's\ncapability to detect sophisticated deepfakes. To verify the effectiveness of\nour approach, we conducted extensive evaluations against existing\nstate-of-the-art methods for cross-dataset generalization and detection of\nunseen images generated by standard diffusion models. Our method showcases\noutstanding performance, achieving an average AUC of 0.749 for cross-data\ngeneralization and 0.893 for robustness against unseen deepfakes, outperforming\nall compared methods. 
The code can be reproduced from the repo:\n\\url{https://github.com/lalithbharadwajbaru/Wavelet-CLIP}\n","authors":["Lalith Bharadwaj Baru","Rohit Boddeda","Shilhora Akshay Patel","Sai Mohan Gajapaka"],"pdf_url":"https://arxiv.org/pdf/2409.18301v3.pdf","comment":"9 Pages, 2 Figures, 3 Tables"},{"id":"http://arxiv.org/abs/2501.03124v2","updated":"2025-01-07T12:33:44Z","published":"2025-01-06T16:31:45Z","title":"PRMBench: A Fine-grained and Challenging Benchmark for Process-Level\n Reward Models","summary":" Process-level Reward Models (PRMs) are crucial for complex reasoning and\ndecision-making tasks, where each intermediate step plays an important role in\nthe reasoning process. Since language models are prone to various types of\nerrors during the reasoning process, PRMs are required to possess nuanced\ncapabilities for detecting various implicit error types in real-world\nscenarios. However, current benchmarks primarily focus on step correctness,\nfailing to evaluate PRMs' performance systematically. To address this gap, we\nintroduce PRMBench, a process-level benchmark specifically designed to assess\nthe fine-grained error detection capabilities of PRMs. PRMBench comprises 6,216\ncarefully designed problems and 83,456 step-level labels, evaluating models\nacross multiple dimensions, including simplicity, soundness, and sensitivity.\nIn our experiments on 15 models, spanning both open-source PRMs and\nclosed-source large language models prompted as critic models, we uncover\nsignificant weaknesses in current PRMs. These findings underscore the\nchallenges inherent in process-level evaluation and highlight key directions\nfor future research. 
We hope PRMBench can be a robust bench for advancing\nresearch on PRM evaluation and development.\n","authors":["Mingyang Song","Zhaochen Su","Xiaoye Qu","Jiawei Zhou","Yu Cheng"],"pdf_url":"https://arxiv.org/pdf/2501.03124v2.pdf","comment":"Project Page: https://prmbench.github.io/"},{"id":"http://arxiv.org/abs/2405.10936v2","updated":"2025-01-07T12:15:01Z","published":"2024-05-17T17:47:39Z","title":"A Survey on Large Language Models with Multilingualism: Recent Advances\n and New Frontiers","summary":" The rapid development of Large Language Models (LLMs) demonstrates remarkable\nmultilingual capabilities in natural language processing, attracting global\nattention in both academia and industry. To mitigate potential discrimination\nand enhance the overall usability and accessibility for diverse language user\ngroups, it is important for the development of language-fair technology.\nDespite the breakthroughs of LLMs, the investigation into the multilingual\nscenario remains insufficient, where a comprehensive survey to summarize recent\napproaches, developments, limitations, and potential solutions is desirable. To\nthis end, we provide a survey with multiple perspectives on the utilization of\nLLMs in the multilingual scenario. We first rethink the transitions between\nprevious and current research on pre-trained language models. Then we introduce\nseveral perspectives on the multilingualism of LLMs, including training and\ninference methods, information retrieval, model security, multi-domain with\nlanguage culture, and usage of datasets. We also discuss the major challenges\nthat arise in these aspects, along with possible solutions. Besides, we\nhighlight future research directions that aim at further enhancing LLMs with\nmultilingualism. 
The survey aims to help the research community address\nmultilingual problems and provide a comprehensive understanding of the core\nconcepts, key techniques, and latest developments in multilingual natural\nlanguage processing based on LLMs.\n","authors":["Kaiyu Huang","Fengran Mo","Xinyu Zhang","Hongliang Li","You Li","Yuanchi Zhang","Weijian Yi","Yulong Mao","Jinchen Liu","Yuzhuang Xu","Jinan Xu","Jian-Yun Nie","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2405.10936v2.pdf","comment":"65 pages, Work in Progress"},{"id":"http://arxiv.org/abs/2501.03722v1","updated":"2025-01-07T12:03:02Z","published":"2025-01-07T12:03:02Z","title":"Self-adaptive vision-language model for 3D segmentation of pulmonary\n artery and vein","summary":" Accurate segmentation of pulmonary structures is crucial in clinical\ndiagnosis, disease study, and treatment planning. Significant progress has been\nmade in deep learning-based segmentation techniques, but most require large\namounts of labeled data for training. Consequently, developing precise segmentation\nmethods that demand fewer labeled datasets is paramount in medical image\nanalysis. The emergence of pre-trained vision-language foundation models, such\nas CLIP, recently opened the door for universal computer vision tasks.\nExploiting the generalization ability of these pre-trained foundation models on\ndownstream tasks, such as segmentation, leads to unexpected performance with a\nrelatively small amount of labeled data. However, exploring these models for\npulmonary artery-vein segmentation is still limited. This paper proposes a\nnovel framework called Language-guided self-adaptive Cross-Attention Fusion\nFramework. Our method adopts pre-trained CLIP as a strong feature extractor for\ngenerating the segmentation of 3D CT scans, while adaptively aggregating the\ncross-modality of text and image representations. 
We propose a specially\ndesigned adapter module to fine-tune pre-trained CLIP with a self-adaptive\nlearning strategy to effectively fuse the two modalities of embeddings. We\nextensively validate our method on a local dataset, which is the largest\npulmonary artery-vein CT dataset to date and consists of 718 labeled scans in\ntotal. The experiments show that our method outperformed other state-of-the-art\nmethods by a large margin. Our data and code will be made publicly available\nupon acceptance.\n","authors":["Xiaotong Guo","Deqian Yang","Dan Wang","Haochen Zhao","Yuan Li","Zhilin Sui","Tao Zhou","Lijun Zhang","Yanda Meng"],"pdf_url":"https://arxiv.org/pdf/2501.03722v1.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2409.03260v2","updated":"2025-01-07T11:54:58Z","published":"2024-09-05T05:51:42Z","title":"In Search of Trees: Decision-Tree Policy Synthesis for Black-Box Systems\n via Search","summary":" Decision trees, owing to their interpretability, are attractive as control\npolicies for (dynamical) systems. Unfortunately, constructing, or synthesising,\nsuch policies is a challenging task. Previous approaches do so by imitating a\nneural-network policy, approximating a tabular policy obtained via formal\nsynthesis, employing reinforcement learning, or modelling the problem as a\nmixed-integer linear program. However, these works may require access to a\nhard-to-obtain accurate policy or a formal model of the environment (within\nreach of formal synthesis), and may not provide guarantees on the quality or\nsize of the final tree policy. In contrast, we present an approach to\nsynthesise optimal decision-tree policies given a deterministic black-box\nenvironment and specification, a discretisation of the tree predicates, and an\ninitial set of states, where optimality is defined with respect to the number\nof steps to achieve the goal. 
Our approach is a specialised search algorithm\nwhich systematically explores the (exponentially large) space of decision trees\nunder the given discretisation. The key component is a novel trace-based\npruning mechanism that significantly reduces the search space. Our approach\nrepresents a conceptually novel way of synthesising small decision-tree\npolicies with optimality guarantees even for black-box environments with\nblack-box specifications.\n","authors":["Emir Demirović","Christian Schilling","Anna Lukina"],"pdf_url":"https://arxiv.org/pdf/2409.03260v2.pdf","comment":"8 pages main text incl. references, 2 pages appendix"},{"id":"http://arxiv.org/abs/2501.03717v1","updated":"2025-01-07T11:52:01Z","published":"2025-01-07T11:52:01Z","title":"Materialist: Physically Based Editing Using Single-Image Inverse\n Rendering","summary":" To perform image editing based on single-view, inverse physically based\nrendering, we present a method combining a learning-based approach with\nprogressive differentiable rendering. Given an image, our method leverages\nneural networks to predict initial material properties. Progressive\ndifferentiable rendering is then used to optimize the environment map and\nrefine the material properties with the goal of closely matching the rendered\nresult to the input image. We require only a single image while other inverse\nrendering methods based on the rendering equation require multiple views. In\ncomparison to single-view methods that rely on neural renderers, our approach\nachieves more realistic light material interactions, accurate shadows, and\nglobal illumination. Furthermore, with optimized material properties and\nillumination, our method enables a variety of tasks, including physically based\nmaterial editing, object insertion, and relighting. We also propose a method\nfor material transparency editing that operates effectively without requiring\nfull scene geometry. 
Compared with methods based on Stable Diffusion, our\napproach offers stronger interpretability and more realistic light refraction\nbased on empirical results.\n","authors":["Lezhong Wang","Duc Minh Tran","Ruiqi Cui","Thomson TG","Manmohan Chandraker","Jeppe Revall Frisvad"],"pdf_url":"https://arxiv.org/pdf/2501.03717v1.pdf","comment":"code will be available at github.com/lez-s/Materialist"},{"id":"http://arxiv.org/abs/2501.03715v1","updated":"2025-01-07T11:44:25Z","published":"2025-01-07T11:44:25Z","title":"Neural Deconstruction Search for Vehicle Routing Problems","summary":" Autoregressive construction approaches generate solutions to vehicle routing\nproblems in a step-by-step fashion, leading to high-quality solutions that are\nnearing the performance achieved by handcrafted, operations research\ntechniques. In this work, we challenge the conventional paradigm of sequential\nsolution construction and introduce an iterative search framework where\nsolutions are instead deconstructed by a neural policy. Throughout the search,\nthe neural policy collaborates with a simple greedy insertion algorithm to\nrebuild the deconstructed solutions. Our approach surpasses the performance of\nstate-of-the-art operations research methods across three challenging vehicle\nrouting problems of various problem sizes.\n","authors":["André Hottung","Paula Wong-Chung","Kevin Tierney"],"pdf_url":"https://arxiv.org/pdf/2501.03715v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.09424v3","updated":"2025-01-07T11:37:57Z","published":"2024-09-14T12:25:14Z","title":"NBBOX: Noisy Bounding Box Improves Remote Sensing Object Detection","summary":" Data augmentation has shown significant advancements in computer vision to\nimprove model performance over the years, particularly in scenarios with\nlimited and insufficient data. 
Currently, most studies focus on adjusting the\nimage or its features to expand the size, quality, and variety of samples\nduring training in various tasks including object detection. However, we argue\nthat it is necessary to investigate bounding box transformations as a data\naugmentation technique rather than image-level transformations, especially in\naerial imagery due to potentially inconsistent bounding box annotations. Hence,\nthis letter presents a thorough investigation of bounding box transformation in\nterms of scaling, rotation, and translation for remote sensing object\ndetection. We call this augmentation strategy NBBOX (Noise Injection into\nBounding Box). We conduct extensive experiments on DOTA and DIOR-R, both\nwell-known datasets that include a variety of rotated generic objects in aerial\nimages. Experimental results show that our approach significantly improves\nremote sensing object detection without whistles and bells and it is more\ntime-efficient than other state-of-the-art augmentation strategies.\n","authors":["Yechan Kim","SooYeon Kim","Moongu Jeon"],"pdf_url":"https://arxiv.org/pdf/2409.09424v3.pdf","comment":"Accepted to IEEE Geoscience and Remote Sensing Letters"},{"id":"http://arxiv.org/abs/2501.03711v1","updated":"2025-01-07T11:32:13Z","published":"2025-01-07T11:32:13Z","title":"Unsupervised Speech Segmentation: A General Approach Using Speech\n Language Models","summary":" In this paper, we introduce an unsupervised approach for Speech Segmentation,\nwhich builds on previously researched approaches, e.g., Speaker Diarization,\nwhile being applicable to an inclusive set of acoustic-semantic distinctions,\npaving a path towards a general Unsupervised Speech Segmentation approach.\nUnlike traditional speech and audio segmentation, which mainly focuses on\nspectral changes in the input signal, e.g., phone segmentation, our approach\ntries to segment the spoken utterance into chunks with differing\nacoustic-semantic styles, focusing on 
acoustic-semantic information that does\nnot translate well into text, e.g., emotion or speaker. While most Speech\nSegmentation tasks only handle one style change, e.g., emotion diarization, our\napproach tries to handle multiple acoustic-semantic style changes. Leveraging\nrecent advances in Speech Language Models (SLMs), we propose a simple\nunsupervised method to segment a given speech utterance. We empirically\ndemonstrate the effectiveness of the proposed approach by considering several\nsetups. Results suggest that the proposed method is superior to the evaluated\nbaselines on boundary detection, segment purity, and over-segmentation. Code is\navailable at\nhttps://github.com/avishaiElmakies/unsupervised_speech_segmentation_using_slm.\n","authors":["Avishai Elmakies","Omri Abend","Yossi Adi"],"pdf_url":"https://arxiv.org/pdf/2501.03711v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10573v2","updated":"2025-01-07T11:13:06Z","published":"2024-06-15T09:23:46Z","title":"Graph Neural Backdoor: Fundamentals, Methodologies, Applications, and\n Future Directions","summary":" Graph Neural Networks (GNNs) have significantly advanced various downstream\ngraph-relevant tasks, encompassing recommender systems, molecular structure\nprediction, social media analysis, etc. Despite the boosts of GNN, recent\nresearch has empirically demonstrated its potential vulnerability to backdoor\nattacks, wherein adversaries employ triggers to poison input samples, inducing\nGNN to adversary-premeditated malicious outputs. This is typically due to the\ncontrolled training process, or the deployment of untrusted models, such as\ndelegating model training to third-party service, leveraging external training\nsets, and employing pre-trained models from online sources. Although there's an\nongoing increase in research on GNN backdoors, comprehensive investigation into\nthis field is lacking. To bridge this gap, we propose the first survey\ndedicated to GNN backdoors. 
We begin by outlining the fundamental definition of\nGNN, followed by the detailed summarization and categorization of current GNN\nbackdoor attacks and defenses based on their technical characteristics and\napplication scenarios. Subsequently, the analysis of the applicability and use\ncases of GNN backdoors is undertaken. Finally, the exploration of potential\nresearch directions of GNN backdoors is presented. This survey aims to explore\nthe principles of graph backdoors, provide insights to defenders, and promote\nfuture security research.\n","authors":["Xiao Yang","Gaolei Li","Jianhua Li"],"pdf_url":"https://arxiv.org/pdf/2406.10573v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11543v3","updated":"2025-01-07T11:09:52Z","published":"2024-11-18T13:01:57Z","title":"PSA-VLM: Enhancing Vision-Language Model Safety through Progressive\n Concept-Bottleneck-Driven Alignment","summary":" Benefiting from the powerful capabilities of Large Language Models (LLMs),\npre-trained visual encoder models connected to LLMs form Vision Language Models\n(VLMs). However, recent research shows that the visual modality in VLMs is\nhighly vulnerable, allowing attackers to bypass safety alignment in LLMs\nthrough visually transmitted content, launching harmful attacks. To address\nthis challenge, we propose a progressive concept-based alignment strategy,\nPSA-VLM, which incorporates safety modules as concept bottlenecks to enhance\nvisual modality safety alignment. By aligning model predictions with specific\nsafety concepts, we improve defenses against risky images, enhancing\nexplainability and controllability while minimally impacting general\nperformance. Our method is obtained through two-stage training. The low\ncomputational cost of the first stage brings very effective performance\nimprovement, and the fine-tuning of the language model in the second stage\nfurther improves the safety performance. 
Our method achieves state-of-the-art\nresults on popular VLM safety benchmark.\n","authors":["Zhendong Liu","Yuanbi Nie","Yingshui Tan","Jiaheng Liu","Xiangyu Yue","Qiushi Cui","Chongjun Wang","Xiaoyong Zhu","Bo Zheng"],"pdf_url":"https://arxiv.org/pdf/2411.11543v3.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2405.13581"},{"id":"http://arxiv.org/abs/2501.03700v1","updated":"2025-01-07T11:07:32Z","published":"2025-01-07T11:07:32Z","title":"AuxDepthNet: Real-Time Monocular 3D Object Detection with\n Depth-Sensitive Features","summary":" Monocular 3D object detection is a challenging task in autonomous systems due\nto the lack of explicit depth information in single-view images. Existing\nmethods often depend on external depth estimators or expensive sensors, which\nincrease computational complexity and hinder real-time performance. To overcome\nthese limitations, we propose AuxDepthNet, an efficient framework for real-time\nmonocular 3D object detection that eliminates the reliance on external depth\nmaps or pre-trained depth models. AuxDepthNet introduces two key components:\nthe Auxiliary Depth Feature (ADF) module, which implicitly learns\ndepth-sensitive features to improve spatial reasoning and computational\nefficiency, and the Depth Position Mapping (DPM) module, which embeds depth\npositional information directly into the detection process to enable accurate\nobject localization and 3D bounding box regression. Leveraging the DepthFusion\nTransformer architecture, AuxDepthNet globally integrates visual and\ndepth-sensitive features through depth-guided interactions, ensuring robust and\nefficient detection. 
Extensive experiments on the KITTI dataset show that\nAuxDepthNet achieves state-of-the-art performance, with $\\text{AP}_{3D}$ scores\nof 24.72\\% (Easy), 18.63\\% (Moderate), and 15.31\\% (Hard), and\n$\\text{AP}_{\\text{BEV}}$ scores of 34.11\\% (Easy), 25.18\\% (Moderate), and\n21.90\\% (Hard) at an IoU threshold of 0.7.\n","authors":["Ruochen Zhang","Hyeung-Sik Choi","Dongwook Jung","Phan Huy Nam Anh","Sang-Ki Jeong","Zihao Zhu"],"pdf_url":"https://arxiv.org/pdf/2501.03700v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03696v1","updated":"2025-01-07T10:54:44Z","published":"2025-01-07T10:54:44Z","title":"Exploring Molecule Generation Using Latent Space Graph Diffusion","summary":" Generating molecular graphs is a challenging task due to their discrete\nnature and the competitive objectives involved. Diffusion models have emerged\nas SOTA approaches in data generation across various modalities. For molecular\ngraphs, graph neural networks (GNNs) as a diffusion backbone have achieved\nimpressive results. Latent space diffusion, where diffusion occurs in a\nlow-dimensional space via an autoencoder, has demonstrated computational\nefficiency. However, the literature on latent space diffusion for molecular\ngraphs is scarce, and no commonly accepted best practices exist. In this work,\nwe explore different approaches and hyperparameters, contrasting generative\nflow models (denoising diffusion, flow matching, heat dissipation) and\narchitectures (GNNs and E(3)-equivariant GNNs). Our experiments reveal a high\nsensitivity to the choice of approach and design decisions. 
Code is made\navailable at\ngithub.com/Prashanth-Pombala/Molecule-Generation-using-Latent-Space-Graph-Diffusion.\n","authors":["Prashanth Pombala","Gerrit Grossmann","Verena Wolf"],"pdf_url":"https://arxiv.org/pdf/2501.03696v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.06096v4","updated":"2025-01-07T10:45:58Z","published":"2024-09-09T22:16:48Z","title":"Latent Diffusion Bridges for Unsupervised Musical Audio Timbre Transfer","summary":" Music timbre transfer is a challenging task that involves modifying the\ntimbral characteristics of an audio signal while preserving its melodic\nstructure. In this paper, we propose a novel method based on dual diffusion\nbridges, trained using the CocoChorales Dataset, which consists of unpaired\nmonophonic single-instrument audio data. Each diffusion model is trained on a\nspecific instrument with a Gaussian prior. During inference, a model is\ndesignated as the source model to map the input audio to its corresponding\nGaussian prior, and another model is designated as the target model to\nreconstruct the target audio from this Gaussian prior, thereby facilitating\ntimbre transfer. We compare our approach against existing unsupervised timbre\ntransfer models such as VAEGAN and Gaussian Flow Bridges (GFB). Experimental\nresults demonstrate that our method achieves both better Fr\\'echet Audio\nDistance (FAD) and melody preservation, as reflected by lower pitch distances\n(DPD) compared to VAEGAN and GFB. Additionally, we discover that the noise\nlevel from the Gaussian prior, $\\sigma$, can be adjusted to control the degree\nof melody preservation and amount of timbre transferred.\n","authors":["Michele Mancusi","Yurii Halychanskyi","Kin Wai Cheuk","Eloi Moliner","Chieh-Hsin Lai","Stefan Uhlich","Junghyun Koo","Marco A. 
Martínez-Ramírez","Wei-Hsiang Liao","Giorgio Fabbro","Yuki Mitsufuji"],"pdf_url":"https://arxiv.org/pdf/2409.06096v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03689v1","updated":"2025-01-07T10:38:51Z","published":"2025-01-07T10:38:51Z","title":"MAJL: A Model-Agnostic Joint Learning Framework for Music Source\n Separation and Pitch Estimation","summary":" Music source separation and pitch estimation are two vital tasks in music\ninformation retrieval. Typically, the input of pitch estimation is obtained\nfrom the output of music source separation. Therefore, existing methods have\ntried to perform these two tasks simultaneously, so as to leverage the mutually\nbeneficial relationship between both tasks. However, these methods still face\ntwo critical challenges that limit the improvement of both tasks: the lack of\nlabeled data and joint learning optimization. To address these challenges, we\npropose a Model-Agnostic Joint Learning (MAJL) framework for both tasks. MAJL\nis a generic framework and can use variant models for each task. It includes a\ntwo-stage training method and a dynamic weighting method named Dynamic Weights\non Hard Samples (DWHS), which addresses the lack of labeled data and joint\nlearning optimization, respectively. Experimental results on public music\ndatasets show that MAJL outperforms state-of-the-art methods on both tasks,\nwith significant improvements of 0.92 in Signal-to-Distortion Ratio (SDR) for\nmusic source separation and 2.71% in Raw Pitch Accuracy (RPA) for pitch\nestimation. 
Furthermore, comprehensive studies not only validate the\neffectiveness of each component of MAJL, but also indicate the great generality\nof MAJL in adapting to different model architectures.\n","authors":["Haojie Wei","Jun Yuan","Rui Zhang","Quanyu Dai","Yueguo Chen"],"pdf_url":"https://arxiv.org/pdf/2501.03689v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03681v1","updated":"2025-01-07T10:29:43Z","published":"2025-01-07T10:29:43Z","title":"SLAM: Towards Efficient Multilingual Reasoning via Selective Language\n Alignment","summary":" Despite the significant improvements achieved by large language models (LLMs)\nin English reasoning tasks, these models continue to struggle with multilingual\nreasoning. Recent studies leverage a full-parameter and two-stage training\nparadigm to teach models to first understand non-English questions and then\nreason. However, this method suffers from both substantial computational\nresource consumption and catastrophic forgetting. The fundamental cause is that,\nwith the primary goal of enhancing multilingual comprehension, an excessive\nnumber of irrelevant layers and parameters are tuned during the first stage.\nGiven our findings that the representation learning of languages is merely\nconducted in lower-level layers, we propose an efficient multilingual reasoning\nalignment approach that precisely identifies and fine-tunes the layers\nresponsible for handling multilingualism. Experimental results show that our\nmethod, SLAM, only tunes 6 layers' feed-forward sub-layers including 6.5-8% of\nall parameters within 7B and 13B LLMs, achieving better average performance\nthan all strong baselines across 10 languages. 
Meanwhile, SLAM only involves\none training stage, reducing training time by 4.1-11.9 compared to the\ntwo-stage method.\n","authors":["Yuchun Fan","Yongyu Mu","Yilin Wang","Lei Huang","Junhao Ruan","Bei Li","Tong Xiao","Shujian Huang","Xiaocheng Feng","Jingbo Zhu"],"pdf_url":"https://arxiv.org/pdf/2501.03681v1.pdf","comment":"Accepted by COLING 2025 (Oral)"},{"id":"http://arxiv.org/abs/2501.03676v1","updated":"2025-01-07T10:22:30Z","published":"2025-01-07T10:22:30Z","title":"SALE-Based Offline Reinforcement Learning with Ensemble Q-Networks","summary":" In this work, we build upon the offline reinforcement learning algorithm TD7,\nwhich incorporates State-Action Learned Embeddings (SALE) and LAP, and propose\na model-free actor-critic algorithm that integrates ensemble Q-networks and a\ngradient diversity penalty from EDAC. The ensemble Q-networks effectively\naddress the challenge of out-of-distribution actions by introducing penalties\nthat guide the actor network to focus on in-distribution actions. Meanwhile,\nthe gradient diversity penalty encourages diverse Q-value gradients, further\nsuppressing overestimation for out-of-distribution actions. Additionally, our\nmethod retains an adjustable behavior cloning (BC) term that directs the actor\nnetwork toward dataset actions during early training stages, while gradually\nreducing its influence as the precision of the Q-ensemble improves. 
These\nenhancements work synergistically to improve training stability and accuracy.\nExperimental results on the D4RL MuJoCo benchmarks demonstrate that our\nalgorithm achieves superior convergence speed, stability, and performance\ncompared to existing methods.\n","authors":["Zheng Chun"],"pdf_url":"https://arxiv.org/pdf/2501.03676v1.pdf","comment":"10 pages, 2 figures, 4 tables"},{"id":"http://arxiv.org/abs/2501.03674v1","updated":"2025-01-07T10:20:16Z","published":"2025-01-07T10:20:16Z","title":"Action Quality Assessment via Hierarchical Pose-guided Multi-stage\n Contrastive Regression","summary":" Action Quality Assessment (AQA), which aims at automatic and fair evaluation\nof athletic performance, has gained increasing attention in recent years.\nHowever, athletes are often in rapid movement and the corresponding visual\nappearance variances are subtle, making it challenging to capture fine-grained\npose differences and leading to poor estimation performance. Furthermore, most\ncommon AQA tasks, such as diving in sports, are usually divided into multiple\nsub-actions, each of which contains different durations. However, existing\nmethods focus on segmenting the video into fixed frames, which disrupts the\ntemporal continuity of sub-actions resulting in unavoidable prediction errors.\nTo address these challenges, we propose a novel action quality assessment\nmethod through hierarchically pose-guided multi-stage contrastive regression.\nFirstly, we introduce a multi-scale dynamic visual-skeleton encoder to capture\nfine-grained spatio-temporal visual and skeletal features. Then, a procedure\nsegmentation network is introduced to separate different sub-actions and obtain\nsegmented features. 
Afterwards, the segmented visual and skeletal features are\nboth fed into a multi-modal fusion module as physics structural priors, to\nguide the model in learning refined activity similarities and variances.\nFinally, a multi-stage contrastive learning regression approach is employed to\nlearn discriminative representations and output prediction results. In\naddition, we introduce a newly-annotated FineDiving-Pose Dataset to improve the\ncurrent low-quality human pose labels. In experiments, the results on\nFineDiving and MTL-AQA datasets demonstrate the effectiveness and superiority\nof our proposed approach. Our source code and dataset are available at\nhttps://github.com/Lumos0507/HP-MCoRe.\n","authors":["Mengshi Qi","Hao Ye","Jiaxuan Peng","Huadong Ma"],"pdf_url":"https://arxiv.org/pdf/2501.03674v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03670v1","updated":"2025-01-07T10:18:22Z","published":"2025-01-07T10:18:22Z","title":"A Diversity-Enhanced Knowledge Distillation Model for Practical Math\n Word Problem Solving","summary":" Math Word Problem (MWP) solving is a critical task in natural language\nprocessing and has garnered significant research interest in recent years. Various\nrecent studies heavily rely on Seq2Seq models and their extensions (e.g.,\nSeq2Tree and Graph2Tree) to generate mathematical equations. While effective,\nthese models struggle to generate diverse but counterpart solution equations,\nlimiting their generalization across various math problem scenarios. In this\npaper, we introduce a novel Diversity-enhanced Knowledge Distillation (DivKD)\nmodel for practical MWP solving. 
Our approach proposes an adaptive diversity\ndistillation method, in which a student model learns diverse equations by\nselectively transferring high-quality knowledge from a teacher model.\nAdditionally, we design a diversity prior-enhanced student model to better\ncapture the diversity distribution of equations by incorporating a conditional\nvariational auto-encoder. Extensive experiments on four MWP benchmark\ndatasets demonstrate that our approach achieves higher answer accuracy than\nstrong baselines while maintaining high efficiency for practical applications.\n","authors":["Yi Zhang","Guangyou Zhou","Zhiwen Xie","Jinjin Ma","Jimmy Xiangji Huang"],"pdf_url":"https://arxiv.org/pdf/2501.03670v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02801v3","updated":"2025-01-07T10:09:18Z","published":"2024-12-03T20:04:32Z","title":"Optimization of Transformer heart disease prediction model based on\n particle swarm optimization algorithm","summary":" Aiming at the latest particle swarm optimization algorithm, this paper\nproposes an improved Transformer model to improve the accuracy of heart disease\nprediction and provide a new algorithm idea. We first use three mainstream\nmachine learning classification algorithms - decision tree, random forest and\nXGBoost, and then output the confusion matrix of these three models. The\nresults showed that the random forest model had the best performance in\npredicting the classification of heart disease, with an accuracy of 92.2%.\nThen, we apply the Transformer model based on the particle swarm optimization (PSO)\nalgorithm to the same dataset for a classification experiment. The results show\nthat the classification accuracy of the model is as high as 96.5%, 4.3\npercentage points higher than that of random forest, which verifies the\neffectiveness of PSO in optimizing the Transformer model. 
From the above research,\nwe can see that particle swarm optimization significantly improves Transformer\nperformance in heart disease prediction. Improving the ability to predict heart\ndisease is a global priority with benefits for all humankind. Accurate\nprediction can enhance public health, optimize medical resources, and reduce\nhealthcare costs, leading to healthier populations and more productive\nsocieties worldwide. This advancement paves the way for more efficient health\nmanagement and supports the foundation of a healthier, more resilient global\ncommunity.\n","authors":["Jingyuan Yi","Peiyang Yu","Tianyi Huang","Zeqiu Xu"],"pdf_url":"https://arxiv.org/pdf/2412.02801v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.01999v2","updated":"2025-01-07T10:04:51Z","published":"2024-08-04T11:55:24Z","title":"Reinforcement Learning for an Efficient and Effective Malware\n Investigation during Cyber Incident Response","summary":" This research focused on enhancing post-incident malware forensic\ninvestigation using reinforcement learning (RL). We proposed an advanced MDP-based\npost-incident malware forensics investigation model and framework to expedite\npost-incident forensics. We then implemented our RL Malware Investigation Model based\non a structured MDP within the proposed framework. To identify malware artefacts,\nthe RL agent acquires and examines forensics evidence files, iteratively\nimproving its capabilities using a Q-table and temporal-difference learning. The\nQ-learning algorithm significantly improved the agent's ability to identify\nmalware. An epsilon-greedy exploration strategy and Q-learning updates enabled\nefficient learning and decision making. Our experimental testing revealed that\noptimal learning rates depend on the MDP environment complexity, with simpler\nenvironments benefiting from higher rates for quicker convergence and complex\nones requiring lower rates for stability. 
Our model's performance in identifying\nand classifying malware reduced malware analysis time compared to human\nexperts, demonstrating robustness and adaptability. The study highlighted the\nsignificance of hyperparameter tuning and suggested adaptive strategies for\ncomplex environments. Our RL-based approach produced promising results and is\nvalidated as an alternative to traditional methods, notably by offering\ncontinuous learning and adaptation to new and evolving malware threats, which\nultimately enhances post-incident forensics investigations.\n","authors":["Dipo Dunsin","Mohamed Chahine Ghanem","Karim Ouazzane","Vassil Vassilev"],"pdf_url":"https://arxiv.org/pdf/2408.01999v2.pdf","comment":"21 pages"},{"id":"http://arxiv.org/abs/2501.02832v2","updated":"2025-01-07T10:01:19Z","published":"2025-01-06T08:16:06Z","title":"Samba-ASR: State-Of-The-Art Speech Recognition Leveraging Structured\n State-Space Models","summary":" We propose Samba ASR, the first state-of-the-art Automatic Speech\nRecognition (ASR) model leveraging the novel Mamba architecture as both encoder\nand decoder, built on the foundation of state-space models (SSMs). Unlike\ntransformer-based ASR models, which rely on self-attention mechanisms to capture\ndependencies, Samba ASR effectively models both local and global temporal\ndependencies using efficient state-space dynamics, achieving remarkable\nperformance gains. By addressing the limitations of transformers, such as\nquadratic scaling with input length and difficulty in handling long-range\ndependencies, Samba ASR achieves superior accuracy and efficiency. Experimental\nresults demonstrate that Samba ASR surpasses existing open-source\ntransformer-based ASR models across various standard benchmarks, establishing it\nas the new state of the art in ASR. Extensive evaluations on the benchmark\ndataset show significant improvements in Word Error Rate (WER), with competitive\nperformance even in low-resource scenarios. Furthermore, the inherent computational 
efficiency and parameter optimization of the Mamba architecture\nmake Samba ASR a scalable and robust solution for diverse ASR tasks. Our\ncontributions include the development of a new Samba ASR architecture for\nautomatic speech recognition (ASR), demonstrating the superiority of structured\nstate-space models (SSMs) over transformer-based models for speech sequence\nprocessing. We provide a comprehensive evaluation on public\nbenchmarks, showcasing state-of-the-art (SOTA) performance, and present an in-depth\nanalysis of computational efficiency, robustness to noise, and sequence\ngeneralization. This work highlights the viability of Mamba SSMs as a\ntransformer-free alternative for efficient and accurate ASR. By leveraging the\nadvancements of state-space modeling, Samba ASR redefines ASR performance\nstandards and sets a new benchmark for future research in this field.\n","authors":["Syed Abdul Gaffar Shakhadri","Kruthika KR","Kartik Basavaraj Angadi"],"pdf_url":"https://arxiv.org/pdf/2501.02832v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14887v3","updated":"2025-01-07T09:55:57Z","published":"2024-09-23T10:35:57Z","title":"Deploying Open-Source Large Language Models: A performance Analysis","summary":" Since the release of ChatGPT in November 2022, large language models (LLMs)\nhave seen considerable success, including in the open-source community, with\nmany open-weight models available. However, the requirements to deploy such a\nservice are often unknown and difficult to evaluate in advance. To facilitate\nthis process, we conducted numerous tests at the Centre Inria de l'Universit\\'e\nde Bordeaux. In this article, we propose a comparison of the performance of\nseveral models of different sizes (mainly Mistral and LLaMa) depending on the\navailable GPUs, using vLLM, a Python library designed to optimize the inference\nof these models. 
Our results provide valuable information for private and\npublic groups wishing to deploy LLMs, allowing them to evaluate the performance\nof different models based on their available hardware. This study thus\ncontributes to facilitating the adoption and use of these large language models\nin various application domains.\n","authors":["Yannis Bendi-Ouis","Dan Dutartre","Xavier Hinaut"],"pdf_url":"https://arxiv.org/pdf/2409.14887v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.16370v2","updated":"2025-01-07T09:34:51Z","published":"2024-11-25T13:26:09Z","title":"A Review of Bayesian Uncertainty Quantification in Deep Probabilistic\n Image Segmentation","summary":" Advancements in image segmentation play an integral role within the broad\nscope of Deep Learning-based Computer Vision. Furthermore, their widespread\napplicability in critical real-world tasks has resulted in challenges related\nto the reliability of such algorithms. Hence, uncertainty quantification has\nbeen extensively studied within this context, enabling the expression of model\nignorance (epistemic uncertainty) or data ambiguity (aleatoric uncertainty) to\nprevent uninformed decision-making. Due to the rapid adoption of Convolutional\nNeural Network (CNN)-based segmentation models in high-stakes applications, a\nsubstantial body of research has been published on this very topic, causing its\nswift expansion into a distinct field. This work provides a comprehensive\noverview of probabilistic segmentation by discussing fundamental concepts of\nuncertainty quantification, governing advancements in the field, as well as the\napplication to various tasks. Moreover, literature on both types of\nuncertainties traces back to four key applications: (1) quantifying statistical\ninconsistencies in the annotation process due to ambiguous images, (2) correlating\nprediction error with uncertainty, (3) expanding the model hypothesis space for\nbetter generalization, and (4) Active Learning.
An extensive discussion follows\nthat includes an overview of utilized datasets for each of the applications and\nan evaluation of the available methods. We also highlight challenges related to\narchitectures, uncertainty quantification methods, standardization and\nbenchmarking, and finally end with recommendations for future work, such as\nmethods based on single forward passes and models that appropriately leverage\nvolumetric data.\n","authors":["M. M. A. Valiuddin","R. J. G. van Sloun","C. G. A. Viviers","P. H. N. de With","F. van der Sommen"],"pdf_url":"https://arxiv.org/pdf/2411.16370v2.pdf","comment":"20 pages, revised"},{"id":"http://arxiv.org/abs/2501.00320v2","updated":"2025-01-07T09:25:32Z","published":"2024-12-31T07:31:46Z","title":"Autonomous Alignment with Human Value on Altruism through Considerate\n Self-imagination and Theory of Mind","summary":" With the widespread application of Artificial Intelligence (AI) in human\nsociety, enabling AI to autonomously align with human values has become a\npressing issue to ensure its sustainable development and benefit to humanity.\nOne of the most important aspects of aligning with human values is the\nnecessity for agents to autonomously make altruistic, safe, and ethical\ndecisions, considering and caring for human well-being. Current AI single-mindedly\npursues absolute superiority in certain tasks, remaining indifferent to the\nsurrounding environment and other agents, which has led to numerous safety\nrisks. Altruistic behavior in human society originates from humans' capacity\nfor empathizing with others, known as Theory of Mind (ToM), combined with predictive\nimaginative interactions before taking action to produce thoughtful and\naltruistic behaviors. Inspired by this, we are committed to endowing agents with\nconsiderate self-imagination and ToM capabilities, driving them through\nimplicit intrinsic motivations to autonomously align with human altruistic\nvalues.
By integrating ToM within the imaginative space, agents keep an eye on\nthe well-being of other agents in real time, proactively anticipate potential\nrisks to themselves and others, and make thoughtful altruistic decisions that\ntake into account negative effects on the environment. The ancient Chinese story of Sima\nGuang Smashes the Vat, in which the young Sima Guang smashed a vat to save a child\nwho had accidentally fallen into it, illustrates such moral behavior and is an\nexcellent reference scenario for this paper. We design an experimental scenario\nsimilar to Sima Guang Smashes the Vat and its variants with different\ncomplexities, which reflects the trade-offs and comprehensive considerations\nbetween self-goals, altruistic rescue, and avoiding negative side effects.\n","authors":["Haibo Tong","Enmeng Lu","Yinqian Sun","Zhengqiang Han","Chao Liu","Feifei Zhao","Yi Zeng"],"pdf_url":"https://arxiv.org/pdf/2501.00320v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03643v1","updated":"2025-01-07T09:21:52Z","published":"2025-01-07T09:21:52Z","title":"Effective and Efficient Mixed Precision Quantization of Speech\n Foundation Models","summary":" This paper presents a novel mixed-precision quantization approach for speech\nfoundation models that tightly integrates mixed-precision learning and\nquantized model parameter estimation into one single model compression stage.\nExperiments conducted on the LibriSpeech dataset with fine-tuned wav2vec2.0-base\nand HuBERT-large models suggest the resulting mixed-precision quantized models\nincreased the lossless compression ratio by factors up to 1.7x and 1.9x over\nthe respective uniform-precision and two-stage mixed-precision quantized\nbaselines that perform precision learning and model parameter quantization in\nseparate and disjointed stages, while incurring no statistically significant word error\nrate (WER) increase over the 32-bit full-precision models.
The system\ncompression time of wav2vec2.0-base and HuBERT-large models is reduced by up to\n1.9 and 1.5 times over the two-stage mixed-precision baselines, while both\nproduce lower WERs. The best-performing 3.5-bit mixed-precision quantized\nHuBERT-large model produces a lossless compression ratio of 8.6x over the\n32-bit full-precision system.\n","authors":["Haoning Xu","Zhaoqing Li","Zengrui Jin","Huimeng Wang","Youjun Chen","Guinan Li","Mengzhe Geng","Shujie Hu","Jiajun Deng","Xunying Liu"],"pdf_url":"https://arxiv.org/pdf/2501.03643v1.pdf","comment":"To appear at IEEE ICASSP 2025"},{"id":"http://arxiv.org/abs/2501.03635v1","updated":"2025-01-07T09:10:09Z","published":"2025-01-07T09:10:09Z","title":"MHGNet: Multi-Heterogeneous Graph Neural Network for Traffic Prediction","summary":" In recent years, traffic flow prediction has played a crucial role in the\nmanagement of intelligent transportation systems. However, traditional\nforecasting methods often model non-Euclidean low-dimensional traffic data as a\nsimple graph with single-type nodes and edges, failing to capture similar\ntrends among nodes of the same type. To address this limitation, this paper\nproposes MHGNet, a novel framework for modeling spatiotemporal\nmulti-heterogeneous graphs. Within this framework, the STD Module decouples\nsingle-pattern traffic data into multi-pattern traffic data through feature\nmappings of timestamp embedding matrices and node embedding matrices.\nSubsequently, the Node Clusterer leverages the Euclidean distance between nodes\nand different types of limit points to perform clustering with O(N) time\ncomplexity. The nodes within each cluster undergo residual subgraph convolution\nwithin the spatiotemporal fusion subgraphs generated by the DSTGG Module,\nfollowed by processing in the SIE Module for node repositioning and\nredistribution of weights. 
To validate the effectiveness of MHGNet, this paper\nconducts extensive ablation studies and quantitative evaluations on four widely\nused benchmarks, demonstrating its superior performance.\n","authors":["Mei Wu","Yiqian Lin","Tianfan Jiang","Wenchao Weng"],"pdf_url":"https://arxiv.org/pdf/2501.03635v1.pdf","comment":"Accepted by the 2025 IEEE International Conference on Acoustics, Speech,\n and Signal Processing (ICASSP 2025)"},{"id":"http://arxiv.org/abs/2309.04195v2","updated":"2025-01-07T08:46:02Z","published":"2023-09-08T08:12:29Z","title":"Towards Mitigating Architecture Overfitting on Distilled Datasets","summary":" Dataset distillation methods have demonstrated remarkable performance for\nneural networks trained with very limited training data. However, a significant\nchallenge arises in the form of \\textit{architecture overfitting}: the\ndistilled training dataset synthesized by a specific network architecture\n(i.e., training network) generates poor performance when trained by other\nnetwork architectures (i.e., test networks), especially when the test networks\nhave a larger capacity than the training network. This paper introduces a\nseries of approaches to mitigate this issue. Among them, DropPath renders the\nlarge model an implicit ensemble of its sub-networks, and knowledge\ndistillation ensures each sub-network acts similarly to the small but\nwell-performing teacher network. These methods, characterized by their\nsmoothing effects, significantly mitigate architecture overfitting. We conduct\nextensive experiments to demonstrate the effectiveness and generality of our\nmethods. Particularly, across various scenarios involving different tasks and\ndifferent sizes of distilled data, our approaches significantly mitigate\narchitecture overfitting.
Furthermore, our approaches achieve comparable or\neven superior performance when the test network is larger than the training\nnetwork.\n","authors":["Xuyang Zhong","Chen Liu"],"pdf_url":"https://arxiv.org/pdf/2309.04195v2.pdf","comment":"Accepted by TNNLS"},{"id":"http://arxiv.org/abs/2501.02981v2","updated":"2025-01-07T08:39:10Z","published":"2025-01-06T12:43:59Z","title":"CONTINUUM: Detecting APT Attacks through Spatial-Temporal Graph Neural\n Networks","summary":" Advanced Persistent Threats (APTs) represent a significant challenge in\ncybersecurity due to their sophisticated and stealthy nature. Traditional\nIntrusion Detection Systems (IDS) often fall short in detecting these\nmulti-stage attacks. Recently, Graph Neural Networks (GNNs) have been employed\nto enhance IDS capabilities by analyzing the complex relationships within\nnetworked data. However, existing GNN-based solutions are hampered by high\nfalse positive rates and substantial resource consumption. In this paper, we\npresent a novel IDS designed to detect APTs using a Spatio-Temporal Graph\nNeural Network Autoencoder. Our approach leverages spatial information to\nunderstand the interactions between entities within a graph and temporal\ninformation to capture the evolution of the graph over time. This dual\nperspective is crucial for identifying the sequential stages of APTs.\nFurthermore, to address privacy and scalability concerns, we deploy our\narchitecture in a federated learning environment. This setup ensures that local\ndata remains on-premise while encrypted model-weights are shared and aggregated\nusing homomorphic encryption, maintaining data privacy and security. 
Our\nevaluation shows that this system effectively detects APTs with lower false\npositive rates and optimized resource usage compared to existing methods,\nhighlighting the potential of spatio-temporal analysis and federated learning\nin enhancing cybersecurity defenses.\n","authors":["Atmane Ayoub Mansour Bahar","Kamel Soaid Ferrahi","Mohamed-Lamine Messai","Hamida Seba","Karima Amrouche"],"pdf_url":"https://arxiv.org/pdf/2501.02981v2.pdf","comment":"31 pages"},{"id":"http://arxiv.org/abs/2412.04783v2","updated":"2025-01-07T08:23:43Z","published":"2024-12-06T05:20:08Z","title":"KNN-MMD: Cross Domain Wireless Sensing via Local Distribution Alignment","summary":" Wireless sensing has recently found widespread applications in diverse\nenvironments, including homes, offices, and public spaces. By analyzing\npatterns in channel state information (CSI), it is possible to infer human\nactions for tasks such as person identification, gesture recognition, and fall\ndetection. However, CSI is highly sensitive to environmental changes, where\neven minor alterations can significantly distort the CSI patterns. This\nsensitivity often leads to performance degradation or outright failure when\napplying wireless sensing models trained in one environment to another. To\naddress this challenge, Domain Alignment (DAL) has been widely adopted for\ncross-domain classification tasks, as it focuses on aligning the global\ndistributions of the source and target domains in feature space. Despite its\npopularity, DAL often neglects inter-category relationships, which can lead to\nmisalignment between categories across domains, even when global alignment is\nachieved. To overcome these limitations, we propose K-Nearest Neighbors Maximum\nMean Discrepancy (KNN-MMD), a novel few-shot method for cross-domain wireless\nsensing. 
Our approach begins by constructing a help set using KNN from the\ntarget domain, enabling local alignment between the source and target domains\nwithin each category using MMD. Additionally, we address a key instability\nissue commonly observed in cross-domain methods, where model performance\nfluctuates sharply between epochs. Further, most existing methods struggle to\ndetermine an optimal stopping point during training due to the absence of\nlabeled data from the target domain. Our method resolves this by excluding the\nsupport set from the target domain during training and employing it as a\nvalidation set to determine the stopping criterion.\n","authors":["Zijian Zhao","Zhijie Cai","Tingwei Chen","Xiaoyang Li","Hang Li","Qimei Chen","Guangxu Zhu"],"pdf_url":"https://arxiv.org/pdf/2412.04783v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03598v1","updated":"2025-01-07T07:55:35Z","published":"2025-01-07T07:55:35Z","title":"RecKG: Knowledge Graph for Recommender Systems","summary":" Knowledge graphs have proven successful in integrating heterogeneous data\nacross various domains. However, there remains a noticeable dearth of research\non their seamless integration among heterogeneous recommender systems, despite\nknowledge graph-based recommender systems garnering extensive research\nattention. This study aims to fill this gap by proposing RecKG, a standardized\nknowledge graph for recommender systems. RecKG ensures the consistent\nrepresentation of entities across different datasets, accommodating diverse\nattribute types for effective data integration. Through a meticulous\nexamination of various recommender system datasets, we select attributes for\nRecKG, ensuring standardized formatting through consistent naming conventions.\nBy these characteristics, RecKG can seamlessly integrate heterogeneous data\nsources, enabling the discovery of additional semantic information within the\nintegrated knowledge graph. 
We apply RecKG to standardize real-world datasets,\nsubsequently developing an application for RecKG using a graph database.\nFinally, we validate RecKG's achievement in interoperability through a\nqualitative evaluation between RecKG and other studies.\n","authors":["Junhyuk Kwon","Seokho Ahn","Young-Duk Seo"],"pdf_url":"https://arxiv.org/pdf/2501.03598v1.pdf","comment":"Accepted by The 39th ACM/SIGAPP Symposium On Applied Computing(SAC)\n 2024"},{"id":"http://arxiv.org/abs/2411.03814v2","updated":"2025-01-07T07:46:16Z","published":"2024-11-06T10:32:09Z","title":"MRJ-Agent: An Effective Jailbreak Agent for Multi-Round Dialogue","summary":" Large Language Models (LLMs) demonstrate outstanding performance in their\nreservoir of knowledge and understanding capabilities, but they have also been\nshown to be prone to illegal or unethical reactions when subjected to jailbreak\nattacks. To ensure their responsible deployment in critical applications, it is\ncrucial to understand the safety capabilities and vulnerabilities of LLMs.\nPrevious works mainly focus on jailbreak in single-round dialogue, overlooking\nthe potential jailbreak risks in multi-round dialogues, which are a vital way\nhumans interact with and extract information from LLMs. Some studies have\nincreasingly concentrated on the risks associated with jailbreak in multi-round\ndialogues. These efforts typically involve the use of manually crafted\ntemplates or prompt engineering techniques. However, due to the inherent\ncomplexity of multi-round dialogues, their jailbreak performance is limited. To\nsolve this problem, we propose a novel multi-round dialogue jailbreaking agent,\nemphasizing the importance of stealthiness in identifying and mitigating\npotential threats to human values posed by LLMs. We propose a risk\ndecomposition strategy that distributes risks across multiple rounds of queries\nand utilizes psychological strategies to enhance attack strength. 
Extensive\nexperiments show that our proposed method surpasses other attack methods and\nachieves a state-of-the-art attack success rate. We will make the corresponding\ncode and dataset available for future research. The code will be released soon.\n","authors":["Fengxiang Wang","Ranjie Duan","Peng Xiao","Xiaojun Jia","Shiji Zhao","Cheng Wei","YueFeng Chen","Chongwen Wang","Jialing Tao","Hang Su","Jun Zhu","Hui Xue"],"pdf_url":"https://arxiv.org/pdf/2411.03814v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.15778v3","updated":"2025-01-07T07:31:00Z","published":"2024-11-24T10:58:48Z","title":"Enhancing the automatic segmentation and analysis of 3D liver\n vasculature models","summary":" Surgical assessment of liver cancer patients requires identification of the\nvessel trees from medical images. Specifically, the venous trees - the portal\n(perfusing) and the hepatic (draining) trees - are important for understanding\nthe liver anatomy and disease state, and for performing surgery planning. This\nresearch aims to improve the 3D segmentation, skeletonization, and subsequent\nanalysis of vessel trees by creating an automatic pipeline based on deep\nlearning and image processing techniques.\n The first part of this work explores the impact of differentiable\nskeletonization methods such as ClDice and morphological skeletonization loss\non the overall liver vessel segmentation performance. To this aim, it studies\nhow to improve vessel tree connectivity.\n The second part of this study converts a single-class vessel segmentation\ninto a multi-class one, separating the two venous trees.
It builds on the\nprevious two-class vessel segmentation model, whose vessel tree outputs might\nbe entangled, and on connected components and skeleton analyses of the trees.\n After providing sub-labeling of the specific anatomical branches of each\nvenous tree, these algorithms also enable a morphometric analysis of the vessel\ntrees by extracting various geometrical markers.\n In conclusion, we propose a method that successfully improves current\nskeletonization methods for extensive vascular trees that contain vessels of\ndifferent calibers. The separation algorithm creates a clean multi-class\nsegmentation of the vessels, validated by surgeons to provide low error. A new,\npublicly shared high-quality liver vessel dataset of 77 cases is thus created.\nFinally, a method to annotate vessel trees according to anatomy is provided,\nenabling a unique liver vessel morphometry analysis.\n","authors":["Yassine Machta","Omar Ali","Kevin Hakkakian","Ana Vlasceanu","Amaury Facque","Nicolas Golse","Irene Vignon-Clementel"],"pdf_url":"https://arxiv.org/pdf/2411.15778v3.pdf","comment":"Paper presented at MICCAI 2024 Workshop: ADSMI. This work was done in\n the context of an internship at Simbiotx, Inria"},{"id":"http://arxiv.org/abs/2501.03583v1","updated":"2025-01-07T07:16:56Z","published":"2025-01-07T07:16:56Z","title":"STContext: A Multifaceted Dataset for Developing Context-aware\n Spatio-temporal Crowd Mobility Prediction Models","summary":" In smart cities, context-aware spatio-temporal crowd flow prediction (STCFP)\nmodels leverage contextual features (e.g., weather) to identify unusual crowd\nmobility patterns and enhance prediction accuracy. However, the best practice\nfor incorporating contextual features remains unclear due to inconsistent usage\nof contextual features in different papers. Developing a multifaceted dataset\nwith rich types of contextual features and STCFP scenarios is crucial for\nestablishing a principled context modeling paradigm.
Existing open crowd flow\ndatasets lack an adequate range of contextual features, making it urgent to\nbuild a multifaceted dataset that fills these research gaps. To\nthis end, we create STContext, a multifaceted dataset for developing\ncontext-aware STCFP models. Specifically, STContext provides nine\nspatio-temporal datasets across five STCFP scenarios and includes ten\ncontextual features, such as weather, air quality index, holidays, points of\ninterest, and road networks. Besides, we propose a unified workflow for\nincorporating contextual features into deep STCFP methods, with steps including\nfeature transformation, dependency modeling, representation fusion, and\ntraining strategies. Through extensive experiments, we have obtained several\nuseful guidelines for effective context modeling and insights for future\nresearch. STContext is open-sourced at\nhttps://github.com/Liyue-Chen/STContext.\n","authors":["Liyue Chen","Jiangyi Fang","Tengfei Liu","Fangyuan Gao","Leye Wang"],"pdf_url":"https://arxiv.org/pdf/2501.03583v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03173v3","updated":"2025-01-07T07:05:05Z","published":"2024-02-05T16:41:02Z","title":"MULTI: Multimodal Understanding Leaderboard with Text and Images","summary":" The rapid development of multimodal large language models (MLLMs) raises the\nquestion of how they compare to human performance. While existing datasets\noften feature synthetic or overly simplistic tasks, some models have already\nsurpassed human expert baselines. In this paper, we present MULTI, a Chinese\nmultimodal dataset derived from authentic examination questions. Comprising\nover 18,000 carefully selected and refined questions, MULTI evaluates models\nusing real-world examination standards, encompassing image-text comprehension,\ncomplex reasoning, and knowledge recall.
Additionally, we introduce\nMULTI-Elite, a selected hard subset of 500 questions, and MULTI-Extend, with more\nthan 4,500 external knowledge context pieces for testing in-context learning\ncapabilities. Our evaluation highlights substantial room for MLLM advancement,\nwith Qwen2-VL-72B achieving 76.9% accuracy on MULTI and 53.1% on MULTI-Elite,\nleading 25 evaluated models, compared to human expert baselines of 86.1% and\n73.1%. MULTI serves not only as a robust evaluation platform but also paves the\nway for the development of expert-level AI.\n","authors":["Zichen Zhu","Yang Xu","Lu Chen","Jingkai Yang","Yichuan Ma","Yiming Sun","Hailin Wen","Jiaqi Liu","Jinyu Cai","Yingzi Ma","Situo Zhang","Zihan Zhao","Liangtai Sun","Kai Yu"],"pdf_url":"https://arxiv.org/pdf/2402.03173v3.pdf","comment":"24 pages, 19 figures, 10 tables. Details and access are available at:\n https://OpenDFM.github.io/MULTI-Benchmark/"},{"id":"http://arxiv.org/abs/2501.03575v1","updated":"2025-01-07T06:55:50Z","published":"2025-01-07T06:55:50Z","title":"Cosmos World Foundation Model Platform for Physical AI","summary":" Physical AI needs to be trained digitally first. It needs a digital twin of\nitself, the policy model, and a digital twin of the world, the world model. In\nthis paper, we present the Cosmos World Foundation Model Platform to help\ndevelopers build customized world models for their Physical AI setups. We\nposition a world foundation model as a general-purpose world model that can be\nfine-tuned into customized world models for downstream applications. Our\nplatform covers a video curation pipeline, pre-trained world foundation models,\nexamples of post-training of pre-trained world foundation models, and video\ntokenizers.
To help Physical AI builders solve the most critical problems of\nour society, we make our platform open-source and our models open-weight with\npermissive licenses available via https://github.com/NVIDIA/Cosmos.\n","authors":[" NVIDIA"," :","Niket Agarwal","Arslan Ali","Maciej Bala","Yogesh Balaji","Erik Barker","Tiffany Cai","Prithvijit Chattopadhyay","Yongxin Chen","Yin Cui","Yifan Ding","Daniel Dworakowski","Jiaojiao Fan","Michele Fenzi","Francesco Ferroni","Sanja Fidler","Dieter Fox","Songwei Ge","Yunhao Ge","Jinwei Gu","Siddharth Gururani","Ethan He","Jiahui Huang","Jacob Huffman","Pooya Jannaty","Jingyi Jin","Seung Wook Kim","Gergely Klár","Grace Lam","Shiyi Lan","Laura Leal-Taixe","Anqi Li","Zhaoshuo Li","Chen-Hsuan Lin","Tsung-Yi Lin","Huan Ling","Ming-Yu Liu","Xian Liu","Alice Luo","Qianli Ma","Hanzi Mao","Kaichun Mo","Arsalan Mousavian","Seungjun Nah","Sriharsha Niverty","David Page","Despoina Paschalidou","Zeeshan Patel","Lindsey Pavao","Morteza Ramezanali","Fitsum Reda","Xiaowei Ren","Vasanth Rao Naik Sabavat","Ed Schmerling","Stella Shi","Bartosz Stefaniak","Shitao Tang","Lyne Tchapmi","Przemek Tredak","Wei-Cheng Tseng","Jibin Varghese","Hao Wang","Haoxiang Wang","Heng Wang","Ting-Chun Wang","Fangyin Wei","Xinyue Wei","Jay Zhangjie Wu","Jiashu Xu","Wei Yang","Lin Yen-Chen","Xiaohui Zeng","Yu Zeng","Jing Zhang","Qinsheng Zhang","Yuxuan Zhang","Qingqing Zhao","Artur Zolkowski"],"pdf_url":"https://arxiv.org/pdf/2501.03575v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03572v1","updated":"2025-01-07T06:51:46Z","published":"2025-01-07T06:51:46Z","title":"From Code to Compliance: Assessing ChatGPT's Utility in Designing an\n Accessible Webpage -- A Case Study","summary":" Web accessibility ensures that individuals with disabilities can access and\ninteract with digital content without barriers, yet a significant majority of\nmost used websites fail to meet accessibility standards. 
This study evaluates\nChatGPT's (GPT-4o) ability to generate and improve web pages in line with Web\nContent Accessibility Guidelines (WCAG). While ChatGPT can effectively address\naccessibility issues when prompted, its default code often lacks compliance,\nreflecting limitations in its training data and prevailing inaccessible web\npractices. Automated and manual testing revealed strengths in resolving simple\nissues but challenges with complex tasks, requiring human oversight and\nadditional iterations. Unlike prior studies, we incorporate manual evaluation,\ndynamic elements, and use the visual reasoning capability of ChatGPT along with\nthe prompts to fix accessibility issues. Providing screenshots alongside\nprompts enhances the LLM's ability to address accessibility issues by allowing\nit to analyze surrounding components, such as determining appropriate contrast\ncolors. We found that effective prompt engineering, such as providing concise,\nstructured feedback and incorporating visual aids, significantly enhances\nChatGPT's performance. These findings highlight the potential and limitations\nof large language models for accessible web development, offering practical\nguidance for developers to create more inclusive websites.\n","authors":["Ammar Ahmed","Margarida Fresco","Fredrik Forsberg","Hallvard Grotli"],"pdf_url":"https://arxiv.org/pdf/2501.03572v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.12370v2","updated":"2025-01-07T06:47:00Z","published":"2024-12-16T21:56:01Z","title":"Scam Detection for Ethereum Smart Contracts: Leveraging Graph\n Representation Learning for Secure Blockchain","summary":" Due to the increasing abuse of fraudulent activities that result in\nsignificant financial and reputational harm, Ethereum smart contracts face a\nsignificant problem in detecting fraud. Existing monitoring methods typically\nrely on lease code analysis or physically extracted features, which suffer from\nscalability and adaptability limitations. 
In this study, we use graph\nrepresentation learning to observe purchase trends and find fraudulent deals.\nWe can achieve powerful categorisation performance by using innovative machine\nlearning versions and transforming Ethereum invoice data into graph structures.\nOur method addresses label imbalance through SMOTE-ENN techniques and evaluates\nmodels like Multi-Layer Perceptron (MLP) and Graph Convolutional Networks\n(GCN). Experimental results show that the MLP type surpasses the GCN in this\nenvironment, with domain-specific assessments closely aligned with real-world\nassessments. This study provides a scalable and efficient way to improve\nconfidence and security in Ethereum's ecosystem.\n","authors":["Yihong Jin","Ze Yang"],"pdf_url":"https://arxiv.org/pdf/2412.12370v2.pdf","comment":"Accepted to BDICN 2025"},{"id":"http://arxiv.org/abs/2407.15320v2","updated":"2025-01-07T06:39:29Z","published":"2024-07-07T09:25:52Z","title":"Edge Graph Intelligence: Reciprocally Empowering Edge Networks with\n Graph Intelligence","summary":" Recent years have witnessed a thriving growth of computing facilities\nconnected at the network edge, cultivating edge networks as a fundamental\ninfrastructure for supporting miscellaneous intelligent services. Meanwhile,\nArtificial Intelligence (AI) frontiers have extrapolated to the graph domain\nand promoted Graph Intelligence (GI). Given the inherent relation between\ngraphs and networks, the interdiscipline of graph learning and edge networks,\ni.e., Edge GI or EGI, has revealed a novel interplay between them -- GI aids in\noptimizing edge networks, while edge networks facilitate GI model deployment.\nDriven by this delicate closed-loop, EGI is recognized as a promising solution\nto fully unleash the potential of edge computing power and is garnering growing\nattention.
Nevertheless, research on EGI remains nascent, and there is a\nsoaring demand within both the communications and AI communities for a\ndedicated venue to share recent advancements. To this end, this paper promotes\nthe concept of EGI, explores its scope and core principles, and conducts a\ncomprehensive survey concerning recent research efforts in this emerging field.\nSpecifically, this paper introduces and discusses: 1) fundamentals of edge\ncomputing and graph learning, 2) emerging techniques centering on the closed\nloop between graph intelligence and edge networks, and 3) open challenges and\nresearch opportunities of future EGI. By bridging the gap across the communication,\nnetworking, and graph learning areas, we believe that this survey can garner\nincreased attention, foster meaningful discussions, and inspire further\nresearch ideas in EGI.\n","authors":["Liekang Zeng","Shengyuan Ye","Xu Chen","Xiaoxi Zhang","Ju Ren","Jian Tang","Yang Yang"," Xuemin"," Shen"],"pdf_url":"https://arxiv.org/pdf/2407.15320v2.pdf","comment":"Accepted by IEEE Communications Surveys & Tutorials"},{"id":"http://arxiv.org/abs/2501.03566v1","updated":"2025-01-07T06:34:17Z","published":"2025-01-07T06:34:17Z","title":"Applying Large Language Models in Knowledge Graph-based Enterprise\n Modeling: Challenges and Opportunities","summary":" The role of large language models (LLMs) in enterprise modeling has recently\nstarted to shift from academic research to industrial applications.\nThereby, LLMs represent a further building block for the machine-supported\ngeneration of enterprise models. In this paper we employ a knowledge\ngraph-based approach for enterprise modeling and investigate the potential\nbenefits of LLMs in this context. In addition, the findings of an expert survey\nand ChatGPT-4o-based experiments demonstrate that LLM-based model generations\nexhibit minimal variability, yet remain constrained to specific tasks, with\nreliability declining for more intricate tasks.
The survey results further\nsuggest that the supervision and intervention of human modeling experts are\nessential to ensure the accuracy and integrity of the generated models.\n","authors":["Benedikt Reitemeyer","Hans-Georg Fill"],"pdf_url":"https://arxiv.org/pdf/2501.03566v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02155v2","updated":"2025-01-07T06:30:24Z","published":"2024-12-03T04:29:27Z","title":"CausalMob: Causal Human Mobility Prediction with LLMs-derived Human\n Intentions toward Public Events","summary":" Large-scale human mobility exhibits spatial and temporal patterns that can\nassist policymakers in decision making. Although traditional prediction models\nattempt to capture these patterns, they are often disrupted by non-periodic public\nevents, such as disasters and occasional celebrations. Since regular human\nmobility patterns are heavily affected by these events, estimating their causal\neffects is critical to accurate mobility predictions. Although news articles\nprovide unique perspectives on these events in an unstructured format,\nprocessing them is a challenge. In this study, we propose a causality-augmented\nprediction model, called CausalMob, to analyze the causal effects of public\nevents. We first utilize large language models (LLMs) to extract human\nintentions from news articles and transform them into features that act as\ncausal treatments. Next, the model learns representations of spatio-temporal\nregional covariates from multiple data sources to serve as confounders for\ncausal inference.
Finally, we present a causal effect estimation framework to\nensure event features remain independent of confounders during prediction.\nBased on large-scale real-world data, the experimental results show that the\nproposed model excels in human mobility prediction, outperforming\nstate-of-the-art models.\n","authors":["Xiaojie Yang","Hangli Ge","Jiawei Wang","Zipei Fan","Renhe Jiang","Ryosuke Shibasaki","Noboru Koshizuka"],"pdf_url":"https://arxiv.org/pdf/2412.02155v2.pdf","comment":"Accepted by KDD 2025"},{"id":"http://arxiv.org/abs/2501.03562v1","updated":"2025-01-07T06:22:55Z","published":"2025-01-07T06:22:55Z","title":"Rethinking Adversarial Attacks in Reinforcement Learning from Policy\n Distribution Perspective","summary":" Deep Reinforcement Learning (DRL) suffers from uncertainties and inaccuracies\nin the observation signal in real-world applications. Adversarial attack is an\neffective method for evaluating the robustness of DRL agents. However, existing\nattack methods targeting individual sampled actions have limited impacts on the\noverall policy distribution, particularly in continuous action spaces. To\naddress these limitations, we propose the Distribution-Aware Projected Gradient\nDescent attack (DAPGD). DAPGD uses distribution similarity as the gradient\nperturbation input to attack the policy network, which leverages the entire\npolicy distribution rather than relying on individual samples. We utilize the\nBhattacharyya distance in DAPGD to measure policy similarity, enabling\nsensitive detection of subtle but critical differences between probability\ndistributions. 
Our experimental results demonstrate that DAPGD achieves SOTA\nresults compared to the baselines in three robot navigation tasks, with an\naverage 22.03% higher reward drop than the best baseline.\n","authors":["Tianyang Duan","Zongyuan Zhang","Zheng Lin","Yue Gao","Ling Xiong","Yong Cui","Hongbin Liang","Xianhao Chen","Heming Cui","Dong Huang"],"pdf_url":"https://arxiv.org/pdf/2501.03562v1.pdf","comment":"10 pages, 2 figures, 2 tables"},{"id":"http://arxiv.org/abs/2501.03560v1","updated":"2025-01-07T06:21:40Z","published":"2025-01-07T06:21:40Z","title":"KG-TRICK: Unifying Textual and Relational Information Completion of\n Knowledge for Multilingual Knowledge Graphs","summary":" Multilingual knowledge graphs (KGs) provide high-quality relational and\ntextual information for various NLP applications, but they are often\nincomplete, especially in non-English languages. Previous research has shown\nthat combining information from KGs in different languages aids either\nKnowledge Graph Completion (KGC), the task of predicting missing relations\nbetween entities, or Knowledge Graph Enhancement (KGE), the task of predicting\nmissing textual information for entities. Although previous efforts have\nconsidered KGC and KGE as independent tasks, we hypothesize that they are\ninterdependent and mutually beneficial. To this end, we introduce KG-TRICK, a\nnovel sequence-to-sequence framework that unifies the tasks of textual and\nrelational information completion for multilingual KGs. KG-TRICK demonstrates\nthat: i) it is possible to unify the tasks of KGC and KGE into a single\nframework, and ii) combining textual information from multiple languages is\nbeneficial to improve the completeness of a KG. 
As part of our contributions,\nwe also introduce WikiKGE10++, the largest manually-curated benchmark for\ntextual information completion of KGs, which features over 25,000 entities\nacross 10 diverse languages.\n","authors":["Zelin Zhou","Simone Conia","Daniel Lee","Min Li","Shenglei Huang","Umar Farooq Minhas","Saloni Potdar","Henry Xiao","Yunyao Li"],"pdf_url":"https://arxiv.org/pdf/2501.03560v1.pdf","comment":"Camera ready for COLING 2025"},{"id":"http://arxiv.org/abs/2501.03544v1","updated":"2025-01-07T05:39:21Z","published":"2025-01-07T05:39:21Z","title":"PromptGuard: Soft Prompt-Guided Unsafe Content Moderation for\n Text-to-Image Models","summary":" Text-to-image (T2I) models have been shown to be vulnerable to misuse,\nparticularly in generating not-safe-for-work (NSFW) content, raising serious\nethical concerns. In this work, we present PromptGuard, a novel content\nmoderation technique that draws inspiration from the system prompt mechanism in\nlarge language models (LLMs) for safety alignment. Unlike LLMs, T2I models lack\na direct interface for enforcing behavioral guidelines. Our key idea is to\noptimize a safety soft prompt that functions as an implicit system prompt\nwithin the T2I model's textual embedding space. This universal soft prompt (P*)\ndirectly moderates NSFW inputs, enabling safe yet realistic image generation\nwithout altering the inference efficiency or requiring proxy models. 
Extensive\nexperiments across three datasets demonstrate that PromptGuard effectively\nmitigates NSFW content generation while preserving high-quality benign outputs.\nPromptGuard is 7.8 times faster than prior content moderation methods,\nsurpassing eight state-of-the-art defenses with an optimal unsafe ratio down to\n5.84%.\n","authors":["Lingzhi Yuan","Xinfeng Li","Chejian Xu","Guanhong Tao","Xiaojun Jia","Yihao Huang","Wei Dong","Yang Liu","XiaoFeng Wang","Bo Li"],"pdf_url":"https://arxiv.org/pdf/2501.03544v1.pdf","comment":"16 pages, 8 figures, 10 tables"},{"id":"http://arxiv.org/abs/2402.14658v3","updated":"2025-01-07T05:37:04Z","published":"2024-02-22T16:06:23Z","title":"OpenCodeInterpreter: Integrating Code Generation with Execution and\n Refinement","summary":" The introduction of large language models has significantly advanced code\ngeneration. However, open-source models often lack the execution capabilities\nand iterative refinement of advanced systems like the GPT-4 Code Interpreter.\nTo address this, we introduce OpenCodeInterpreter, a family of open-source code\nsystems designed for generating, executing, and iteratively refining code.\nSupported by Code-Feedback, a dataset featuring 68K multi-turn interactions,\nOpenCodeInterpreter integrates execution and human feedback for dynamic code\nrefinement. Our comprehensive evaluation of OpenCodeInterpreter across key\nbenchmarks such as HumanEval, MBPP, and their enhanced versions from EvalPlus\nreveals its exceptional performance. Notably, OpenCodeInterpreter-33B achieves\nan accuracy of 83.2 (76.4) on the average (and plus versions) of HumanEval and\nMBPP, closely rivaling GPT-4's 84.2 (76.2) and further elevates to 91.6 (84.6)\nwith synthesized human feedback from GPT-4. 
OpenCodeInterpreter bridges the gap\nbetween open-source code generation models and proprietary systems like GPT-4\nCode Interpreter.\n","authors":["Tianyu Zheng","Ge Zhang","Tianhao Shen","Xueling Liu","Bill Yuchen Lin","Jie Fu","Wenhu Chen","Xiang Yue"],"pdf_url":"https://arxiv.org/pdf/2402.14658v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.02156v2","updated":"2025-01-07T05:36:22Z","published":"2025-01-04T01:45:32Z","title":"The Race to Efficiency: A New Perspective on AI Scaling Laws","summary":" As large-scale AI models expand, training becomes costlier and sustaining\nprogress grows harder. Classical scaling laws (e.g., Kaplan et al. (2020),\nHoffmann et al. (2022)) predict training loss from a static compute budget yet\nneglect time and efficiency, prompting the question: how can we balance\nballooning GPU fleets with rapidly improving hardware and algorithms? We\nintroduce the relative-loss equation, a time- and efficiency-aware framework\nthat extends classical AI scaling laws. Our model shows that, without ongoing\nefficiency gains, advanced performance could demand millennia of training or\nunrealistically large GPU fleets. However, near-exponential progress remains\nachievable if the \"efficiency-doubling rate\" parallels Moore's Law. By\nformalizing this race to efficiency, we offer a quantitative roadmap for\nbalancing front-loaded GPU investments with incremental improvements across the\nAI stack. Empirical trends suggest that sustained efficiency gains can push AI\nscaling well into the coming decade, providing a new perspective on the\ndiminishing returns inherent in classical scaling.\n","authors":["Chien-Ping Lu"],"pdf_url":"https://arxiv.org/pdf/2501.02156v2.pdf","comment":"21 pages, 3 figures. 
2 tables, second draft"},{"id":"http://arxiv.org/abs/2402.13516v7","updated":"2025-01-07T05:26:54Z","published":"2024-02-21T03:58:49Z","title":"ProSparse: Introducing and Enhancing Intrinsic Activation Sparsity\n within Large Language Models","summary":" Activation sparsity refers to the existence of considerable\nweakly-contributed elements among activation outputs. As a prevalent property\nof the models using the ReLU activation function, activation sparsity has been\nproven a promising paradigm to boost model inference efficiency. Nevertheless,\nmost large language models (LLMs) adopt activation functions without intrinsic\nactivation sparsity (e.g., GELU and Swish). Some recent efforts have explored\nintroducing ReLU or its variants as the substitutive activation function to\nhelp LLMs achieve activation sparsity and inference acceleration, but few can\nsimultaneously obtain high sparsity and comparable model performance. This\npaper introduces a simple and effective sparsification method named \"ProSparse\"\nto push LLMs for higher activation sparsity while maintaining comparable\nperformance. Specifically, after substituting the activation function of LLMs\nwith ReLU, ProSparse adopts progressive sparsity regularization with a factor\nsmoothly increasing along the multi-stage sine curves. This can enhance\nactivation sparsity and mitigate performance degradation by avoiding radical\nshifts in activation distributions. With ProSparse, we obtain high sparsity of\n89.32% for LLaMA2-7B, 88.80% for LLaMA2-13B, and 87.89% for end-size\nMiniCPM-1B, respectively, achieving comparable performance to their original\nSwish-activated versions. These present the most sparsely activated models\namong open-source LLaMA versions and competitive end-size models, considerably\nsurpassing ReluLLaMA-7B (66.98%) and ReluLLaMA-13B (71.56%). 
Our inference\nacceleration experiments further demonstrate the significant practical\nacceleration potential of LLMs with higher activation sparsity, obtaining up to\n4.52$\\times$ inference speedup.\n","authors":["Chenyang Song","Xu Han","Zhengyan Zhang","Shengding Hu","Xiyu Shi","Kuai Li","Chen Chen","Zhiyuan Liu","Guangli Li","Tao Yang","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2402.13516v7.pdf","comment":"19 pages, 4 figures, 9 tables"},{"id":"http://arxiv.org/abs/2501.03540v1","updated":"2025-01-07T05:23:36Z","published":"2025-01-07T05:23:36Z","title":"Deep Learning within Tabular Data: Foundations, Challenges, Advances and\n Future Directions","summary":" Tabular data remains one of the most prevalent data types across a wide range\nof real-world applications, yet effective representation learning for this\ndomain poses unique challenges due to its irregular patterns, heterogeneous\nfeature distributions, and complex inter-column dependencies. This survey\nprovides a comprehensive review of state-of-the-art techniques in tabular data\nrepresentation learning, structured around three foundational design elements:\ntraining data, neural architectures, and learning objectives. Unlike prior\nsurveys that focus primarily on either architecture design or learning\nstrategies, we adopt a holistic perspective that emphasizes the universality\nand robustness of representation learning methods across diverse downstream\ntasks. We examine recent advances in data augmentation and generation,\nspecialized neural network architectures tailored to tabular data, and\ninnovative learning objectives that enhance representation quality.\nAdditionally, we highlight the growing influence of self-supervised learning\nand the adaptation of transformer-based foundation models for tabular data. Our\nreview is based on a systematic literature search using rigorous inclusion\ncriteria, encompassing 127 papers published since 2020 in top-tier conferences\nand journals. 
Through detailed analysis and comparison, we identify emerging\ntrends, critical gaps, and promising directions for future research, aiming to\nguide the development of more generalizable and effective tabular data\nrepresentation methods.\n","authors":["Weijieying Ren","Tianxiang Zhao","Yuqing Huang","Vasant Honavar"],"pdf_url":"https://arxiv.org/pdf/2501.03540v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.11876v2","updated":"2025-01-07T05:20:13Z","published":"2024-10-10T01:23:16Z","title":"Rescriber: Smaller-LLM-Powered User-Led Data Minimization for Navigating\n Privacy Trade-offs in LLM-Based Conversational Agent","summary":" The proliferation of LLM-based conversational agents has resulted in\nexcessive disclosure of identifiable or sensitive information. However,\nexisting technologies fail to offer perceptible control or account for users'\npersonal preferences about privacy-utility tradeoffs due to the lack of user\ninvolvement. To bridge this gap, we designed, built, and evaluated Rescriber, a\nbrowser extension that supports user-led data minimization in LLM-based\nconversational agents by helping users detect and sanitize personal information\nin their prompts. Our studies (N=12) showed that Rescriber helped users reduce\nunnecessary disclosure and addressed their privacy concerns. Users' subjective\nperceptions of the system powered by Llama3-8B were on par with that by GPT-4o.\nThe comprehensiveness and consistency of the detection and sanitization emerge\nas essential factors that affect users' trust and perceived protection. 
Our\nfindings confirm the viability of smaller-LLM-powered, user-facing, on-device\nprivacy controls, presenting a promising approach to address the privacy and\ntrust challenges of AI.\n","authors":["Jijie Zhou","Eryue Xu","Yaoyao Wu","Tianshi Li"],"pdf_url":"https://arxiv.org/pdf/2410.11876v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03535v1","updated":"2025-01-07T05:15:46Z","published":"2025-01-07T05:15:46Z","title":"SenseRAG: Constructing Environmental Knowledge Bases with Proactive\n Querying for LLM-Based Autonomous Driving","summary":" This study addresses the critical need for enhanced situational awareness in\nautonomous driving (AD) by leveraging the contextual reasoning capabilities of\nlarge language models (LLMs). Unlike traditional perception systems that rely\non rigid, label-based annotations, it integrates real-time, multimodal sensor\ndata into a unified, LLMs-readable knowledge base, enabling LLMs to dynamically\nunderstand and respond to complex driving environments. To overcome the\ninherent latency and modality limitations of LLMs, a proactive\nRetrieval-Augmented Generation (RAG) is designed for AD, combined with a\nchain-of-thought prompting mechanism, ensuring rapid and context-rich\nunderstanding. 
Experimental results using real-world Vehicle-to-everything\n(V2X) datasets demonstrate significant improvements in perception and\nprediction performance, highlighting the potential of this framework to enhance\nsafety, adaptability, and decision-making in next-generation AD systems.\n","authors":["Xuewen Luo","Fan Ding","Fengze Yang","Yang Zhou","Junnyong Loo","Hwa Hui Tew","Chenxi Liu"],"pdf_url":"https://arxiv.org/pdf/2501.03535v1.pdf","comment":"This paper has been accepted for presentation at WACV Workshop LLMAD\n 2025"},{"id":"http://arxiv.org/abs/2401.06949v2","updated":"2025-01-07T05:00:50Z","published":"2024-01-13T02:03:28Z","title":"ORGANA: A Robotic Assistant for Automated Chemistry Experimentation and\n Characterization","summary":" Chemistry experiments can be resource- and labor-intensive, often requiring\nmanual tasks like polishing electrodes in electrochemistry. Traditional lab\nautomation infrastructure faces challenges adapting to new experiments. To\naddress this, we introduce ORGANA, an assistive robotic system that automates\ndiverse chemistry experiments using decision-making and perception tools. It\nmakes decisions with chemists in the loop to control robots and lab devices.\nORGANA interacts with chemists using Large Language Models (LLMs) to derive\nexperiment goals, handle disambiguation, and provide experiment logs. ORGANA\nplans and executes complex tasks with visual feedback, while supporting\nscheduling and parallel task execution. We demonstrate ORGANA's capabilities in\nsolubility, pH measurement, recrystallization, and electrochemistry\nexperiments. In electrochemistry, it executes a 19-step plan in parallel to\ncharacterize quinone derivatives for flow batteries. 
Our user study shows\nORGANA reduces frustration and physical demand by over 50%, with users saving\nan average of 80.3% of their time when using it.\n","authors":["Kourosh Darvish","Marta Skreta","Yuchi Zhao","Naruki Yoshikawa","Sagnik Som","Miroslav Bogdanovic","Yang Cao","Han Hao","Haoping Xu","Alán Aspuru-Guzik","Animesh Garg","Florian Shkurti"],"pdf_url":"https://arxiv.org/pdf/2401.06949v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.12935v2","updated":"2025-01-07T04:42:20Z","published":"2024-06-17T03:03:34Z","title":"ChatBug: A Common Vulnerability of Aligned LLMs Induced by Chat\n Templates","summary":" Large language models (LLMs) are expected to follow instructions from users\nand engage in conversations. Techniques to enhance LLMs' instruction-following\ncapabilities typically fine-tune them using data structured according to a\npredefined chat template. Although chat templates are shown to be effective in\noptimizing LLM performance, their impact on safety alignment of LLMs has been\nless understood, which is crucial for deploying LLMs safely at scale.\n In this paper, we investigate how chat templates affect safety alignment of\nLLMs. We identify a common vulnerability, named ChatBug, that is introduced by\nchat templates. Our key insight to identify ChatBug is that the chat templates\nprovide a rigid format that needs to be followed by LLMs, but not by users.\nHence, a malicious user may not necessarily follow the chat template when\nprompting LLMs. Instead, malicious users could leverage their knowledge of the\nchat template and accordingly craft their prompts to bypass safety alignments\nof LLMs. We develop two attacks to exploit the ChatBug vulnerability. We\ndemonstrate that a malicious user can exploit the ChatBug vulnerability of\neight state-of-the-art (SOTA) LLMs and effectively elicit unintended responses\nfrom these models. 
Moreover, we show that ChatBug can be exploited by existing\njailbreak attacks to enhance their attack success rates. We investigate\npotential countermeasures to ChatBug. Our results show that while adversarial\ntraining effectively mitigates the ChatBug vulnerability, the victim model\nincurs significant performance degradation. These results highlight the\ntrade-off between safety alignment and helpfulness. Developing new methods for\ninstruction tuning to balance this trade-off is an open and critical direction\nfor future research.\n","authors":["Fengqing Jiang","Zhangchen Xu","Luyao Niu","Bill Yuchen Lin","Radha Poovendran"],"pdf_url":"https://arxiv.org/pdf/2406.12935v2.pdf","comment":"This paper is accepted to AAAI 2025"},{"id":"http://arxiv.org/abs/2501.03523v1","updated":"2025-01-07T04:38:28Z","published":"2025-01-07T04:38:28Z","title":"Vocal Tract Length Warped Features for Spoken Keyword Spotting","summary":" In this paper, we propose several methods that incorporate vocal tract length\n(VTL) warped features for spoken keyword spotting (KWS). The first method,\nVTL-independent KWS, involves training a single deep neural network (DNN) that\nutilizes VTL features with various warping factors. During training, a specific\nVTL feature is randomly selected per epoch, allowing the exploration of VTL\nvariations. During testing, the VTL features with different warping factors of\na test utterance are scored against the DNN and combined with equal weight. The\nsecond method scores the conventional features of a test utterance (without\nVTL warping) against the DNN. The third method, VTL-concatenation KWS,\nconcatenates VTL warped features to form high-dimensional features for KWS.\nEvaluations carried out on the English Google Command dataset demonstrate that\nthe proposed methods improve the accuracy of KWS.\n","authors":["Achintya kr. 
Sarkar","Priyanka Dwivedi","Zheng-Hua Tan"],"pdf_url":"https://arxiv.org/pdf/2501.03523v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04828v5","updated":"2025-01-07T04:38:25Z","published":"2023-12-08T05:01:47Z","title":"HuRef: HUman-REadable Fingerprint for Large Language Models","summary":" Protecting the copyright of large language models (LLMs) has become crucial\ndue to their resource-intensive training and accompanying carefully designed\nlicenses. However, identifying the original base model of an LLM is challenging\ndue to potential parameter alterations. In this study, we introduce HuRef, a\nhuman-readable fingerprint for LLMs that uniquely identifies the base model\nwithout interfering with training or exposing model parameters to the public.\nWe first observe that the vector direction of LLM parameters remains stable\nafter the model has converged during pretraining, with negligible perturbations\nthrough subsequent training steps, including continued pretraining, supervised\nfine-tuning, and RLHF, which makes it a sufficient condition to identify the\nbase model. The necessity is validated by continuing to train an LLM with an\nextra term to drive away the model parameters' direction and the model becomes\ndamaged. However, this direction is vulnerable to simple attacks like dimension\npermutation or matrix rotation, which significantly change it without affecting\nperformance. To address this, leveraging the Transformer structure, we\nsystematically analyze potential attacks and define three invariant terms that\nidentify an LLM's base model. Due to the potential risk of information leakage,\nwe cannot publish invariant terms directly. Instead, we map them to a Gaussian\nvector using an encoder, then convert it into a natural image using StyleGAN2,\nand finally publish the image. In our black-box setting, all fingerprinting\nsteps are internally conducted by the LLMs owners. 
To ensure the published\nfingerprints are honestly generated, we introduced Zero-Knowledge Proof (ZKP).\nExperimental results across various LLMs demonstrate the effectiveness of our\nmethod. The code is available at https://github.com/LUMIA-Group/HuRef.\n","authors":["Boyi Zeng","Lizheng Wang","Yuncong Hu","Yi Xu","Chenghu Zhou","Xinbing Wang","Yu Yu","Zhouhan Lin"],"pdf_url":"https://arxiv.org/pdf/2312.04828v5.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2408.06954v2","updated":"2025-01-07T04:11:55Z","published":"2024-08-13T15:13:21Z","title":"Neural Speech and Audio Coding: Modern AI Technology Meets Traditional\n Codecs","summary":" This paper explores the integration of model-based and data-driven approaches\nwithin the realm of neural speech and audio coding systems. It highlights the\nchallenges posed by the subjective evaluation processes of speech and audio\ncodecs and discusses the limitations of purely data-driven approaches, which\noften require inefficiently large architectures to match the performance of\nmodel-based methods. The study presents hybrid systems as a viable solution,\noffering significant improvements to the performance of conventional codecs\nthrough meticulously chosen design enhancements. Specifically, it introduces a\nneural network-based signal enhancer designed to post-process existing codecs'\noutput, along with the autoencoder-based end-to-end models and LPCNet--hybrid\nsystems that combine linear predictive coding (LPC) with neural networks.\nFurthermore, the paper delves into predictive models operating within custom\nfeature spaces (TF-Codec) or predefined transform domains (MDCTNet) and\nexamines the use of psychoacoustically calibrated loss functions to train\nend-to-end neural audio codecs. 
Through these investigations, the paper\ndemonstrates the potential of hybrid systems to advance the field of speech and\naudio coding by bridging the gap between traditional model-based approaches and\nmodern data-driven techniques.\n","authors":["Minje Kim","Jan Skoglund"],"pdf_url":"https://arxiv.org/pdf/2408.06954v2.pdf","comment":"Published in IEEE Signal Processing Magazine"},{"id":"http://arxiv.org/abs/2501.03228v2","updated":"2025-01-07T04:05:53Z","published":"2025-01-06T18:59:55Z","title":"LightGNN: Simple Graph Neural Network for Recommendation","summary":" Graph neural networks (GNNs) have demonstrated superior performance in\ncollaborative recommendation through their ability to conduct high-order\nrepresentation smoothing, effectively capturing structural information within\nusers' interaction patterns. However, existing GNN paradigms face significant\nchallenges in scalability and robustness when handling large-scale, noisy, and\nreal-world datasets. To address these challenges, we present LightGNN, a\nlightweight and distillation-based GNN pruning framework designed to\nsubstantially reduce model complexity while preserving essential collaboration\nmodeling capabilities. Our LightGNN framework introduces a computationally\nefficient pruning module that adaptively identifies and removes redundant edges\nand embedding entries for model compression. The framework is guided by a\nresource-friendly hierarchical knowledge distillation objective, whose\nintermediate layer augments the observed graph to maintain performance,\nparticularly in high-rate compression scenarios. Extensive experiments on\npublic datasets demonstrate LightGNN's effectiveness, significantly improving\nboth computational efficiency and recommendation accuracy. Notably, LightGNN\nachieves an 80% reduction in edge count and 90% reduction in embedding entries\nwhile maintaining performance comparable to more complex state-of-the-art\nbaselines. 
The implementation of our LightGNN framework is available at the\ngithub repository: https://github.com/HKUDS/LightGNN.\n","authors":["Guoxuan Chen","Lianghao Xia","Chao Huang"],"pdf_url":"https://arxiv.org/pdf/2501.03228v2.pdf","comment":"Accepted to WSDM 2025 Oral"},{"id":"http://arxiv.org/abs/2305.17740v2","updated":"2025-01-07T04:03:46Z","published":"2023-05-28T14:48:38Z","title":"Bridging the Language Gap: Dynamic Learning Strategies for Improving\n Multilingual Performance in LLMs","summary":" Large language models (LLMs) have revolutionized various domains but still\nstruggle with non-Latin scripts and low-resource languages. This paper\naddresses the critical challenge of improving multilingual performance without\nextensive fine-tuning. We introduce a novel dynamic learning approach that\noptimizes prompt strategy, embedding model, and LLM per query at runtime. By\nadapting configurations dynamically, our method achieves significant\nimprovements over static, best and random baselines. It operates efficiently in\nboth offline and online settings, generalizing seamlessly across new languages\nand datasets. Leveraging Retrieval-Augmented Generation (RAG) with\nstate-of-the-art multilingual embeddings, we achieve superior task performance\nacross diverse linguistic contexts. 
Through systematic investigation and\nevaluation across 18 diverse languages using popular question-answering (QA)\ndatasets, we show our approach results in 10-15% improvements in multilingual\nperformance over pre-trained models and 4x gains compared to fine-tuned,\nlanguage-specific models.\n","authors":["Somnath Kumar","Vaibhav Balloli","Mercy Ranjit","Kabir Ahuja","Sunayana Sitaram","Kalika Bali","Tanuja Ganu","Akshay Nambi"],"pdf_url":"https://arxiv.org/pdf/2305.17740v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.06167v3","updated":"2025-01-07T03:59:37Z","published":"2023-10-09T21:36:21Z","title":"Predictable Artificial Intelligence","summary":" We introduce the fundamental ideas and challenges of Predictable AI, a\nnascent research area that explores the ways in which we can anticipate key\nvalidity indicators (e.g., performance, safety) of present and future AI\necosystems. We argue that achieving predictability is crucial for fostering\ntrust, liability, control, alignment and safety of AI ecosystems, and thus\nshould be prioritised over performance. We formally characterise\npredictability, explore its most relevant components, illustrate what can be\npredicted, describe alternative candidates for predictors, as well as the\ntrade-offs between maximising validity and predictability. To illustrate these\nconcepts, we bring an array of illustrative examples covering diverse ecosystem\nconfigurations. Predictable AI is related to other areas of technical and\nnon-technical AI research, but has distinctive questions, hypotheses,\ntechniques and challenges. This paper aims to elucidate them, calls for\nidentifying paths towards a landscape of predictably valid AI systems and\noutlines the potential impact of this emergent field.\n","authors":["Lexin Zhou","Pablo A. 
Moreno-Casares","Fernando Martínez-Plumed","John Burden","Ryan Burnell","Lucy Cheke","Cèsar Ferri","Alexandru Marcoci","Behzad Mehrbakhsh","Yael Moros-Daval","Seán Ó hÉigeartaigh","Danaja Rutar","Wout Schellaert","Konstantinos Voudouris","José Hernández-Orallo"],"pdf_url":"https://arxiv.org/pdf/2310.06167v3.pdf","comment":"Paper Under Review"},{"id":"http://arxiv.org/abs/2410.23111v5","updated":"2025-01-07T03:56:49Z","published":"2024-10-30T15:23:44Z","title":"Exploring Gradient Subspaces: Addressing and Overcoming LoRA's\n Limitations in Federated Fine-Tuning of Large Language Models","summary":" Large Language Models (LLMs) have demonstrated remarkable capabilities across\nvarious domains, particularly in task generalization for both text and vision\ndata. While fine-tuning these models can significantly enhance their\nperformance on specific downstream tasks, it often requires high-quality data\nthat cannot be shared due to privacy concerns. Federated Learning (FL) offers a\npromising solution for collaborative training without direct data sharing.\nHowever, many parameter-efficient fine-tuning strategies for LLMs in FL,\nparticularly those based on Low-Rank Adaptation (LoRA), face limitations. In\nthis paper, we critically analyze the convergence and performance guarantees of\npopular FL frameworks utilizing LoRA, highlighting its suboptimal nature due to\nconstrained subspace learning of low-rank matrices. This limitation hinders\neffective fine-tuning of LLMs in federated settings. Through rigorous\nanalytical and empirical evaluations, we demonstrate that direct weight\naveraging outperforms LoRA-based strategies, leading to superior performance\nfor fine-tuned models. Our comprehensive comparison unmasks inefficiencies in\nLoRA approaches and underscores the advantages of direct weight aggregation. We\nextend our analysis to low-rank gradient-based optimizers, such as GaLore, used\nduring local training steps. 
Our findings show that GaLore along with\ndirect-weight aggregation is a more effective approach, outperforming federated\nLoRA methods like FlexLoRA and FFA-LoRA across both text and image modalities.\nWhile privacy remains paramount in FL discourse, our focus is on assessing\nperformance outcomes of federated fine-tuned models and evaluating various FL\nframeworks from both theoretical and empirical perspectives. Our findings\nadvocate reassessing the reliance on LoRA within FL contexts, paving the way\nfor more efficient training methodologies.\n","authors":["Navyansh Mahla","Kshitij Sharad Jadhav","Ganesh Ramakrishnan"],"pdf_url":"https://arxiv.org/pdf/2410.23111v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16766v2","updated":"2025-01-07T03:53:12Z","published":"2024-05-27T02:27:28Z","title":"Concept Matching with Agent for Out-of-Distribution Detection","summary":" The remarkable achievements of Large Language Models (LLMs) have captivated\nthe attention of both academia and industry, transcending their initial role in\ndialogue generation. To expand the usage scenarios of LLM, some works enhance\nthe effectiveness and capabilities of the model by introducing more external\ninformation, which is called the agent paradigm. Based on this idea, we propose\na new method that integrates the agent paradigm into out-of-distribution (OOD)\ndetection task, aiming to improve its robustness and adaptability. Our proposed\nmethod, Concept Matching with Agent (CMA), employs neutral prompts as agents to\naugment the CLIP-based OOD detection process. These agents function as dynamic\nobservers and communication hubs, interacting with both In-distribution (ID)\nlabels and data inputs to form vector triangle relationships. This triangular\nframework offers a more nuanced approach than the traditional binary\nrelationship, allowing for better separation and identification of ID and OOD\ninputs. 
Our extensive experimental results showcase the superior performance of\nCMA over both zero-shot and training-required methods in a diverse array of\nreal-world scenarios.\n","authors":["Yuxiao Lee","Xiaofeng Cao","Jingcai Guo","Wei Ye","Qing Guo","Yi Chang"],"pdf_url":"https://arxiv.org/pdf/2405.16766v2.pdf","comment":"Accepted by AAAI-25"},{"id":"http://arxiv.org/abs/2501.03499v1","updated":"2025-01-07T03:39:43Z","published":"2025-01-07T03:39:43Z","title":"Can Deep Learning Trigger Alerts from Mobile-Captured Images?","summary":" Our research presents a comprehensive approach to leveraging mobile camera\nimage data for real-time air quality assessment and recommendation. We develop\na regression-based Convolutional Neural Network model and tailor it explicitly\nfor air quality prediction by exploiting the inherent relationship between\noutput parameters. As a result, the Mean Squared Error of 0.0077 and 0.0112\nobtained for 2 and 5 pollutants respectively outperforms existing models.\nFurthermore, we aim to verify the common practice of augmenting the original\ndataset with a view to introducing more variation in the training phase. It is\none of our most significant contributions that our experimental results\ndemonstrate minimal accuracy differences between the original and augmented\ndatasets. Finally, a real-time, user-friendly dashboard is implemented which\ndynamically displays the Air Quality Index and pollutant values derived from\ncaptured mobile camera images. Users' health conditions are considered to\nrecommend whether a location is suitable based on current air quality metrics.\nOverall, this research contributes to verification of data augmentation\ntechniques, CNN-based regression modelling for air quality prediction, and\nuser-centric air quality monitoring through mobile technology. 
The proposed\nsystem offers practical solutions for individuals to make informed\nenvironmental health and well-being decisions.\n","authors":["Pritisha Sarkar","Duranta Durbaar Vishal Saha","Mousumi Saha"],"pdf_url":"https://arxiv.org/pdf/2501.03499v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24080v2","updated":"2025-01-07T03:33:00Z","published":"2024-10-31T16:16:51Z","title":"Graph Learning for Numeric Planning","summary":" Graph learning is naturally well suited for use in symbolic, object-centric\nplanning due to its ability to exploit relational structures exhibited in\nplanning domains and to take as input planning instances with arbitrary numbers\nof objects. Numeric planning is an extension of symbolic planning in which\nstates may now also exhibit numeric variables. In this work, we propose\ndata-efficient and interpretable machine learning models for learning to solve\nnumeric planning tasks. This involves constructing a new graph kernel for\ngraphs with both continuous and categorical attributes, as well as new\noptimisation methods for learning heuristic functions for numeric planning.\nExperiments show that our graph kernels are vastly more efficient and\ngeneralise better than graph neural networks for numeric planning, and also\nyield competitive coverage performance compared to domain-independent numeric\nplanners. Code is available at https://github.com/DillonZChen/goose\n","authors":["Dillon Z. Chen","Sylvie Thiébaux"],"pdf_url":"https://arxiv.org/pdf/2410.24080v2.pdf","comment":"Extended version of NeurIPS 2024 paper"},{"id":"http://arxiv.org/abs/2501.02024v2","updated":"2025-01-07T03:29:43Z","published":"2025-01-02T20:47:04Z","title":"Model Checking in Medical Imaging for Tumor Detection and Segmentation","summary":" Recent advancements in model checking have demonstrated significant potential\nacross diverse applications, particularly in signal and image analysis. 
Medical\nimaging stands out as a critical domain where model checking can be effectively\napplied to design and evaluate robust frameworks. These frameworks facilitate\nautomatic and semi-automatic delineation of regions of interest within images,\naiding in accurate segmentation. This paper provides a comprehensive analysis\nof recent works leveraging spatial logic to develop operators and tools for\nidentifying regions of interest, including tumorous and non-tumorous areas.\nAdditionally, we examine the challenges inherent to spatial model-checking\ntechniques, such as variability in ground truth data and the need for\nstreamlined procedures suitable for routine clinical practice.\n","authors":["Elhoucine Elfatimi","Lahcen El fatimi"],"pdf_url":"https://arxiv.org/pdf/2501.02024v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03491v1","updated":"2025-01-07T03:21:17Z","published":"2025-01-07T03:21:17Z","title":"Can LLMs Design Good Questions Based on Context?","summary":" This paper evaluates questions generated by LLMs from context, comparing them\nto human-generated questions across six dimensions. We introduce an automated\nLLM-based evaluation method, focusing on aspects like question length, type,\ncontext coverage, and answerability. Our findings highlight unique\ncharacteristics of LLM-generated questions, contributing insights that can\nsupport further research in question quality and downstream applications.\n","authors":["Yueheng Zhang","Xiaoyuan Liu","Yiyou Sun","Atheer Alharbi","Hend Alzahrani","Basel Alomair","Dawn Song"],"pdf_url":"https://arxiv.org/pdf/2501.03491v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11397v2","updated":"2025-01-07T03:17:48Z","published":"2024-06-17T10:33:00Z","title":"DistPred: A Distribution-Free Probabilistic Inference Method for\n Regression and Forecasting","summary":" Traditional regression and prediction tasks often only provide deterministic\npoint estimates. 
To estimate the distribution or uncertainty of the response\nvariable, traditional methods either assume that the posterior distribution of\nsamples follows a Gaussian process or require thousands of forward passes for\nsample generation. We propose a novel approach called DistPred for regression\nand forecasting tasks, which overcomes the limitations of existing methods\nwhile remaining simple and powerful. Specifically, we transform proper scoring\nrules that measure the discrepancy between the predicted distribution and the\ntarget distribution into a differentiable discrete form and use it as a loss\nfunction to train the model end-to-end. This allows the model to sample\nnumerous samples in a single forward pass to estimate the potential\ndistribution of the response variable. We have compared our method with several\nexisting approaches on multiple datasets and achieved state-of-the-art\nperformance. Additionally, our method significantly improves computational\nefficiency. For example, compared to state-of-the-art models, DistPred has a\n180x faster inference speed. Experimental results can be reproduced through\nhttps://github.com/Anoise/DistPred.\n","authors":["Daojun Liang","Haixia Zhang","Dongfeng Yuan"],"pdf_url":"https://arxiv.org/pdf/2406.11397v2.pdf","comment":"Published at KDD 2025"},{"id":"http://arxiv.org/abs/2412.19391v2","updated":"2025-01-07T03:15:49Z","published":"2024-12-27T00:36:40Z","title":"An In-Depth Analysis of Adversarial Discriminative Domain Adaptation for\n Digit Classification","summary":" Domain adaptation is an active area of research driven by the growing demand\nfor robust machine learning models that perform well on real-world data.\nAdversarial learning for deep neural networks (DNNs) has emerged as a promising\napproach to improving generalization ability, particularly for image\nclassification. 
In this paper, we implement a specific adversarial learning\ntechnique known as Adversarial Discriminative Domain Adaptation (ADDA) and\nreplicate digit classification experiments from the original ADDA paper. We\nextend their findings by examining a broader range of domain shifts and provide\na detailed analysis of in-domain classification accuracy post-ADDA. Our results\ndemonstrate that ADDA significantly improves accuracy across certain domain\nshifts with minimal impact on in-domain performance. Furthermore, we provide\nqualitative analysis and propose potential explanations for ADDA's limitations\nin less successful domain shifts. Code is at\nhttps://github.com/eugenechoi2004/COS429_FINAL .\n","authors":["Eugene Choi","Julian Rodriguez","Edmund Young"],"pdf_url":"https://arxiv.org/pdf/2412.19391v2.pdf","comment":"Replacement: Updated methodology section to include grayscale\n preprocessing of SVHN data"},{"id":"http://arxiv.org/abs/2501.03486v1","updated":"2025-01-07T03:14:39Z","published":"2025-01-07T03:14:39Z","title":"Align-Pro: A Principled Approach to Prompt Optimization for LLM\n Alignment","summary":" The alignment of large language models (LLMs) with human values is critical\nas these models become increasingly integrated into various societal and\ndecision-making processes. Traditional methods, such as reinforcement learning\nfrom human feedback (RLHF), achieve alignment by fine-tuning model parameters,\nbut these approaches are often computationally expensive and impractical when\nmodels are frozen or inaccessible for parameter modification. In contrast,\nprompt optimization is a viable alternative to RLHF for LLM alignment. While\nthe existing literature has shown empirical promise of prompt optimization, its\ntheoretical underpinning remains under-explored. We address this gap by\nformulating prompt optimization as an optimization problem and try to provide\ntheoretical insights into the optimality of such a framework. 
To analyze the\nperformance of the prompt optimization, we study theoretical suboptimality\nbounds and provide insights in terms of how prompt optimization depends upon\nthe given prompter and target model. We also provide empirical validation\nthrough experiments on various datasets, demonstrating that prompt optimization\ncan effectively align LLMs, even when parameter fine-tuning is not feasible.\n","authors":["Prashant Trivedi","Souradip Chakraborty","Avinash Reddy","Vaneet Aggarwal","Amrit Singh Bedi","George K. Atia"],"pdf_url":"https://arxiv.org/pdf/2501.03486v1.pdf","comment":"27 pages, Accepted in AAAI 2025"},{"id":"http://arxiv.org/abs/2411.03334v3","updated":"2025-01-07T03:01:49Z","published":"2024-10-23T19:56:57Z","title":"Neural Network Prediction of Strong Lensing Systems with Domain\n Adaptation and Uncertainty Quantification","summary":" Modeling strong gravitational lenses is computationally expensive for the\ncomplex data from modern and next-generation cosmic surveys. Deep learning has\nemerged as a promising approach for finding lenses and predicting lensing\nparameters, such as the Einstein radius. Mean-variance Estimators (MVEs) are a\ncommon approach for obtaining aleatoric (data) uncertainties from a neural\nnetwork prediction. However, neural networks have not been demonstrated to\nperform well on out-of-domain target data successfully - e.g., when trained on\nsimulated data and applied to real, observational data. In this work, we\nperform the first study of the efficacy of MVEs in combination with\nunsupervised domain adaptation (UDA) on strong lensing data. The source domain\ndata is noiseless, and the target domain data has noise mimicking modern\ncosmology surveys. We find that adding UDA to MVE increases the accuracy on the\ntarget data by a factor of about two over an MVE model without UDA. 
Including\nUDA also permits much more well-calibrated aleatoric uncertainty predictions.\nAdvancements in this approach may enable future applications of MVE models to\nreal observational data.\n","authors":["Shrihan Agarwal","Aleksandra Ćiprijanović","Brian D. Nord"],"pdf_url":"https://arxiv.org/pdf/2411.03334v3.pdf","comment":"Accepted to the Machine Learning for Physical Sciences workshop at\n NeurIPS 2024; 24 pages, 2 figures, 4 tables"},{"id":"http://arxiv.org/abs/2501.01691v2","updated":"2025-01-07T02:57:03Z","published":"2025-01-03T08:18:08Z","title":"VidFormer: A novel end-to-end framework fused by 3DCNN and Transformer\n for Video-based Remote Physiological Measurement","summary":" Remote physiological signal measurement based on facial videos, also known as\nremote photoplethysmography (rPPG), involves predicting changes in facial\nvascular blood flow from facial videos. While most deep learning-based methods\nhave achieved good results, they often struggle to balance performance across\nsmall and large-scale datasets due to the inherent limitations of convolutional\nneural networks (CNNs) and Transformer. In this paper, we introduce VidFormer,\na novel end-to-end framework that integrates 3-Dimension Convolutional Neural\nNetwork (3DCNN) and Transformer models for rPPG tasks. Initially, we conduct an\nanalysis of the traditional skin reflection model and subsequently introduce an\nenhanced model for the reconstruction of rPPG signals. Based on this improved\nmodel, VidFormer utilizes 3DCNN and Transformer to extract local and global\nfeatures from input data, respectively. To enhance the spatiotemporal feature\nextraction capabilities of VidFormer, we incorporate temporal-spatial attention\nmechanisms tailored for both 3DCNN and Transformer. Additionally, we design a\nmodule to facilitate information exchange and fusion between the 3DCNN and\nTransformer. 
Our evaluation on five publicly available datasets demonstrates\nthat VidFormer outperforms current state-of-the-art (SOTA) methods. Finally, we\ndiscuss the essential roles of each VidFormer module and examine the effects of\nethnicity, makeup, and exercise on its performance.\n","authors":["Jiachen Li","Shisheng Guo","Longzhen Tang","Cuolong Cui","Lingjiang Kong","Xiaobo Yang"],"pdf_url":"https://arxiv.org/pdf/2501.01691v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.02964v2","updated":"2025-01-07T02:55:15Z","published":"2025-01-06T12:16:56Z","title":"Socratic Questioning: Learn to Self-guide Multimodal Reasoning in the\n Wild","summary":" Complex visual reasoning remains a key challenge today. Typically, the\nchallenge is tackled using methodologies such as Chain of Thought (COT) and\nvisual instruction tuning. However, how to organically combine these two\nmethodologies for greater success remains unexplored. Also, issues like\nhallucinations and high training cost still need to be addressed. In this work,\nwe devise an innovative multi-round training and reasoning framework suitable\nfor lightweight Multimodal Large Language Models (MLLMs). Our self-questioning\napproach heuristically guides MLLMs to focus on visual clues relevant to the\ntarget problem, reducing hallucinations and enhancing the model's ability to\ndescribe fine-grained image details. This ultimately enables the model to\nperform well in complex visual reasoning and question-answering tasks. We have\nnamed this framework Socratic Questioning (SQ). To facilitate future research,\nwe create a multimodal mini-dataset named CapQA, which includes 1k images of\nfine-grained activities, for visual instruction tuning and evaluation. Our\nproposed SQ method leads to a 31.2% improvement in the hallucination score. Our\nextensive experiments on various benchmarks demonstrate SQ's remarkable\ncapabilities in heuristic self-questioning, zero-shot visual reasoning and\nhallucination mitigation. 
Our model and code will be publicly available.\n","authors":["Wanpeng Hu","Haodi Liu","Lin Chen","Feng Zhou","Changming Xiao","Qi Yang","Changshui Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.02964v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03475v1","updated":"2025-01-07T02:33:25Z","published":"2025-01-07T02:33:25Z","title":"Reading with Intent -- Neutralizing Intent","summary":" Queries to large language models (LLMs) can be divided into two parts: the\ninstruction/question and the accompanying context. The context for\nretrieval-augmented generation (RAG) systems in most benchmarks comes from\nWikipedia or Wikipedia-like texts which are written in a neutral and factual\ntone. However, when RAG systems retrieve internet-based content, they encounter\ntext with diverse tones and linguistic styles, introducing challenges for\ndownstream tasks. The Reading with Intent task addresses this issue by\nevaluating how varying tones in context passages affect model performance.\nBuilding on prior work that focused on sarcasm, we extend this paradigm by\nconstructing a dataset where context passages are transformed to $11$ distinct\nemotions using a better synthetic data generation approach. Using this dataset,\nwe train an emotion translation model to systematically adapt passages to\nspecified emotional tones. The human evaluation shows that the LLM fine-tuned\nto become the emotion-translator benefited from the synthetically generated\ndata. Finally, the emotion-translator is used in the Reading with Intent task\nto transform the passages to a neutral tone. 
By neutralizing the passages, it\nmitigates the challenges posed by sarcastic passages and improves overall\nresults on this task by about $3\\%$.\n","authors":["Benjamin Reichman","Adar Avsian","Larry Heck"],"pdf_url":"https://arxiv.org/pdf/2501.03475v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.01973v2","updated":"2025-01-07T02:10:45Z","published":"2024-12-28T02:28:19Z","title":"INFELM: In-depth Fairness Evaluation of Large Text-To-Image Models","summary":" The rapid development of large language models (LLMs) and large vision models\n(LVMs) has propelled the evolution of multi-modal AI systems, which have\ndemonstrated the remarkable potential for industrial applications by emulating\nhuman-like cognition. However, they also pose significant ethical challenges,\nincluding amplifying harmful content and reinforcing societal biases. For\ninstance, biases in some industrial image generation models highlighted the\nurgent need for robust fairness assessments. Most existing evaluation\nframeworks focus on the comprehensiveness of various aspects of the models, but\nthey exhibit critical limitations, including insufficient attention to content\ngeneration alignment and social bias-sensitive domains. More importantly, their\nreliance on pixel-detection techniques is prone to inaccuracies.\n To address these issues, this paper presents INFELM, an in-depth fairness\nevaluation on widely-used text-to-image models. Our key contributions are: (1)\nan advanced skintone classifier incorporating facial topology and refined skin\npixel representation to enhance classification precision by at least 16.04%,\n(2) a bias-sensitive content alignment measurement for understanding societal\nimpacts, (3) a generalizable representation bias evaluation for diverse\ndemographic groups, and (4) extensive experiments analyzing large-scale\ntext-to-image model outputs across six social-bias-sensitive domains. 
We find\nthat existing models in the study generally do not meet the empirical fairness\ncriteria, and representation bias is generally more pronounced than alignment\nerrors. INFELM establishes a robust benchmark for fairness assessment,\nsupporting the development of multi-modal AI systems that align with ethical\nand human-centric principles.\n","authors":["Di Jin","Xing Liu","Yu Liu","Jia Qing Yap","Andrea Wong","Adriana Crespo","Qi Lin","Zhiyuan Yin","Qiang Yan","Ryan Ye"],"pdf_url":"https://arxiv.org/pdf/2501.01973v2.pdf","comment":"Di Jin and Xing Liu contributed equally to this work"},{"id":"http://arxiv.org/abs/2501.03468v1","updated":"2025-01-07T01:52:56Z","published":"2025-01-07T01:52:56Z","title":"MTRAG: A Multi-Turn Conversational Benchmark for Evaluating\n Retrieval-Augmented Generation Systems","summary":" Retrieval-augmented generation (RAG) has recently become a very popular task\nfor Large Language Models (LLMs). Evaluating them on multi-turn RAG\nconversations, where the system is asked to generate a response to a question\nin the context of a preceding conversation is an important and often overlooked\ntask with several additional challenges. We present MTRAG: an end-to-end\nhuman-generated multi-turn RAG benchmark that reflects several real-world\nproperties across diverse dimensions for evaluating the full RAG pipeline.\nMTRAG contains 110 conversations averaging 7.7 turns each across four domains\nfor a total of 842 tasks. We also explore automation paths via synthetic data\nand LLM-as-a-Judge evaluation. Our human and automatic evaluations show that\neven state-of-the-art LLM RAG systems struggle on MTRAG. We demonstrate the\nneed for strong retrieval and generation systems that can handle later turns,\nunanswerable questions, non-standalone questions, and multiple domains. 
MTRAG\nis available at https://github.com/ibm/mt-rag-benchmark.\n","authors":["Yannis Katsis","Sara Rosenthal","Kshitij Fadnis","Chulaka Gunasekara","Young-Suk Lee","Lucian Popa","Vraj Shah","Huaiyu Zhu","Danish Contractor","Marina Danilevsky"],"pdf_url":"https://arxiv.org/pdf/2501.03468v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.19139v2","updated":"2025-01-07T01:50:11Z","published":"2024-12-26T09:51:05Z","title":"PlanLLM: Video Procedure Planning with Refinable Large Language Models","summary":" Video procedure planning, i.e., planning a sequence of action steps given the\nvideo frames of start and goal states, is an essential ability for embodied AI.\nRecent works utilize Large Language Models (LLMs) to generate enriched action\nstep description texts to guide action step decoding. Although LLMs are\nintroduced, these methods decode the action steps into a closed-set of one-hot\nvectors, limiting the model's capability of generalizing to new steps or tasks.\nAdditionally, fixed action step descriptions based on world-level commonsense\nmay contain noise in specific instances of visual states. In this paper, we\npropose PlanLLM, a cross-modal joint learning framework with LLMs for video\nprocedure planning. We propose an LLM-Enhanced Planning module which fully uses\nthe generalization ability of LLMs to produce free-form planning output and to\nenhance action step decoding. We also propose Mutual Information Maximization\nmodule to connect world-level commonsense of step descriptions and\nsample-specific information of visual states, enabling LLMs to employ the\nreasoning ability to generate step sequences. With the assistance of LLMs, our\nmethod can handle both closed-set and open vocabulary procedure planning tasks. 
Our\nPlanLLM achieves superior performance on three benchmarks, demonstrating the\neffectiveness of our designs.\n","authors":["Dejie Yang","Zijing Zhao","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2412.19139v2.pdf","comment":"accepted to AAAI2025"},{"id":"http://arxiv.org/abs/2501.03464v1","updated":"2025-01-07T01:45:39Z","published":"2025-01-07T01:45:39Z","title":"LHGNN: Local-Higher Order Graph Neural Networks For Audio Classification\n and Tagging","summary":" Transformers have set new benchmarks in audio processing tasks, leveraging\nself-attention mechanisms to capture complex patterns and dependencies within\naudio data. However, their focus on pairwise interactions limits their ability\nto process the higher-order relations essential for identifying distinct audio\nobjects. To address this limitation, this work introduces the Local-Higher\nOrder Graph Neural Network (LHGNN), a graph-based model that enhances feature\nunderstanding by integrating local neighbourhood information with higher-order\ndata from Fuzzy C-Means clusters, thereby capturing a broader spectrum of audio\nrelationships. Evaluation of the model on three publicly available audio\ndatasets shows that it outperforms Transformer-based models across all\nbenchmarks while operating with substantially fewer parameters. 
Moreover, LHGNN\ndemonstrates a distinct advantage in scenarios lacking ImageNet pretraining,\nestablishing its effectiveness and efficiency in environments where extensive\npretraining data is unavailable.\n","authors":["Shubhr Singh","Emmanouil Benetos","Huy Phan","Dan Stowell"],"pdf_url":"https://arxiv.org/pdf/2501.03464v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22376v2","updated":"2025-01-07T01:41:13Z","published":"2024-10-29T07:43:39Z","title":"Rare-to-Frequent: Unlocking Compositional Generation Power of Diffusion\n Models on Rare Concepts with LLM Guidance","summary":" State-of-the-art text-to-image (T2I) diffusion models often struggle to\ngenerate rare compositions of concepts, e.g., objects with unusual attributes.\nIn this paper, we show that the compositional generation power of diffusion\nmodels on such rare concepts can be significantly enhanced by the Large\nLanguage Model (LLM) guidance. We start with empirical and theoretical\nanalysis, demonstrating that exposing frequent concepts relevant to the target\nrare concepts during the diffusion sampling process yields more accurate\nconcept composition. Based on this, we propose a training-free approach, R2F,\nthat plans and executes the overall rare-to-frequent concept guidance\nthroughout the diffusion inference by leveraging the abundant semantic\nknowledge in LLMs. Our framework is flexible across any pre-trained diffusion\nmodels and LLMs, and can be seamlessly integrated with the region-guided\ndiffusion approaches. In extensive experiments on three datasets, including our\nnewly proposed benchmark, RareBench, which contains various prompts with rare\ncompositions of concepts, R2F significantly surpasses existing models including\nSD3.0 and FLUX by up to 28.1%p in T2I alignment. 
Code is available at\nhttps://github.com/krafton-ai/Rare-to-Frequent.\n","authors":["Dongmin Park","Sebin Kim","Taehong Moon","Minkyu Kim","Kangwook Lee","Jaewoong Cho"],"pdf_url":"https://arxiv.org/pdf/2410.22376v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14026v2","updated":"2025-01-07T01:40:42Z","published":"2024-09-21T05:58:07Z","title":"Uncovering Latent Chain of Thought Vectors in Language Models","summary":" As language models grow more influential and trusted in our society, our\nability to reliably steer them toward favorable behaviors becomes increasingly\nparamount. For this, we investigate the technique of steering vectors: biasing\nthe forward pass of language models using a \"steering vector\" derived from a\nspecific task. We apply them to steer language models toward performing Chain\nof Thought (CoT) Reasoning without the need to prompt through natural language.\nWe demonstrate this approach on Llama3 8b and Mistral 7b v0.2, and obtain\ncompetitive results compared to CoT-prompted performances on a series of\nreasoning benchmarks (GSM8k, MMLU, AGI Eval, ARC AI2) and qualitative examples.\nWe find this approach yields consistent steering towards CoT responses and\ntakes less compute than traditional methods of fine-tuning models towards CoT.\n","authors":["Jason Zhang","Scott Viteri"],"pdf_url":"https://arxiv.org/pdf/2409.14026v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03461v1","updated":"2025-01-07T01:35:56Z","published":"2025-01-07T01:35:56Z","title":"Radar Signal Recognition through Self-Supervised Learning and Domain\n Adaptation","summary":" Automatic radar signal recognition (RSR) plays a pivotal role in electronic\nwarfare (EW), as accurately classifying radar signals is critical for informing\ndecision-making processes. Recent advances in deep learning have shown\nsignificant potential in improving RSR performance in domains with ample\nannotated data. 
However, these methods fall short in EW scenarios where\nannotated RF data are scarce or impractical to obtain. To address these\nchallenges, we introduce a self-supervised learning (SSL) method which utilises\nmasked signal modelling and RF domain adaptation to enhance RSR performance in\nenvironments with limited RF samples and labels. Specifically, we investigate\npre-training masked autoencoders (MAE) on baseband in-phase and quadrature\n(I/Q) signals from various RF domains and subsequently transfer the learned\nrepresentation to the radar domain, where annotated data are limited. Empirical\nresults show that our lightweight self-supervised ResNet model with domain\nadaptation achieves up to a 17.5\% improvement in 1-shot classification\naccuracy when pre-trained on in-domain signals (i.e., radar signals) and up to\na 16.31\% improvement when pre-trained on out-of-domain signals (i.e., comm\nsignals), compared to its baseline without SSL. We also provide reference\nresults for several MAE designs and pre-training strategies, establishing a new\nbenchmark for few-shot radar signal classification.\n","authors":["Zi Huang","Akila Pemasiri","Simon Denman","Clinton Fookes","Terrence Martin"],"pdf_url":"https://arxiv.org/pdf/2501.03461v1.pdf","comment":"5 pages, 9 figures"},{"id":"http://arxiv.org/abs/2501.03458v1","updated":"2025-01-07T01:19:48Z","published":"2025-01-07T01:19:48Z","title":"Activating Associative Disease-Aware Vision Token Memory for LLM-Based\n X-ray Report Generation","summary":" X-ray image based medical report generation achieves significant progress in\nrecent years with the help of the large language model; however, these models\nhave not fully exploited the effective information in visual image regions,\nresulting in reports that are linguistically sound but insufficient in\ndescribing key diseases. 
In this paper, we propose a novel associative\nmemory-enhanced X-ray report generation model that effectively mimics the\nprocess of professional doctors writing medical reports. It considers both the\nmining of global and local visual information and associates historical report\ninformation to better complete the writing of the current report. Specifically,\ngiven an X-ray image, we first utilize a classification model along with its\nactivation maps to accomplish the mining of visual regions highly associated\nwith diseases and the learning of disease query tokens. Then, we employ a\nvisual Hopfield network to establish memory associations for disease-related\ntokens, and a report Hopfield network to retrieve report memory information.\nThis process facilitates the generation of high-quality reports based on a\nlarge language model and achieves state-of-the-art performance on multiple\nbenchmark datasets, including the IU X-ray, MIMIC-CXR, and Chexpert Plus. The\nsource code of this work is released on\n\\url{https://github.com/Event-AHU/Medical_Image_Analysis}.\n","authors":["Xiao Wang","Fuling Wang","Haowen Wang","Bo Jiang","Chuanfu Li","Yaowei Wang","Yonghong Tian","Jin Tang"],"pdf_url":"https://arxiv.org/pdf/2501.03458v1.pdf","comment":"In Peer Review"},{"id":"http://arxiv.org/abs/2403.05260v2","updated":"2025-01-07T00:53:48Z","published":"2024-03-08T12:31:03Z","title":"Towards generalization of drug response prediction to single cells and\n patients utilizing importance-aware multi-source domain transfer learning","summary":" The advancement of single-cell sequencing technology has promoted the\ngeneration of a large amount of single-cell transcriptional profiles, providing\nunprecedented opportunities to identify drug-resistant cell subpopulations\nwithin a tumor. However, few studies have focused on drug response prediction\nat single-cell level, and their performance remains suboptimal. 
This paper\nproposed scAdaDrug, a novel multi-source domain adaptation model powered by\nadaptive importance-aware representation learning to predict drug response of\nindividual cells. We used a shared encoder to extract domain-invariant features\nrelated to drug response from multiple source domains by utilizing adversarial\ndomain adaptation. Particularly, we introduced a plug-and-play module to\ngenerate importance-aware and mutually independent weights, which could\nadaptively modulate the latent representation of each sample in element-wise\nmanner between source and target domains. Extensive experimental results showed\nthat our model achieved state-of-the-art performance in predicting drug\nresponse on multiple independent datasets, including single-cell datasets\nderived from both cell lines and patient-derived xenografts (PDX) models, as\nwell as clinical tumor patient cohorts. Moreover, the ablation experiments\ndemonstrated our model effectively captured the underlying patterns determining\ndrug response from multiple source domains.\n","authors":["Hui Liu","Wei Duan","Judong Luo"],"pdf_url":"https://arxiv.org/pdf/2403.05260v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.00826v2","updated":"2025-01-07T00:15:11Z","published":"2025-01-01T13:08:17Z","title":"LLM-Powered Multi-Agent System for Automated Crypto Portfolio Management","summary":" Cryptocurrency investment is inherently difficult due to its shorter history\ncompared to traditional assets, the need to integrate vast amounts of data from\nvarious modalities, and the requirement for complex reasoning. While deep\nlearning approaches have been applied to address these challenges, their\nblack-box nature raises concerns about trust and explainability. Recently,\nlarge language models (LLMs) have shown promise in financial applications due\nto their ability to understand multi-modal data and generate explainable\ndecisions. 
However, a single LLM faces limitations in complex, comprehensive\ntasks such as asset investment. These limitations are even more pronounced in\ncryptocurrency investment, where LLMs have less domain-specific knowledge in\ntheir training corpora.\n To overcome these challenges, we propose an explainable, multi-modal,\nmulti-agent framework for cryptocurrency investment. Our framework uses\nspecialized agents that collaborate within and across teams to handle subtasks\nsuch as data analysis, literature integration, and investment decision-making\nfor the top 30 cryptocurrencies by market capitalization. The expert training\nmodule fine-tunes agents using multi-modal historical data and professional\ninvestment literature, while the multi-agent investment module employs\nreal-time data to make informed cryptocurrency investment decisions. Unique\nintrateam and interteam collaboration mechanisms enhance prediction accuracy by\nadjusting final predictions based on confidence levels within agent teams and\nfacilitating information sharing between teams. Empirical evaluation using data\nfrom November 2023 to September 2024 demonstrates that our framework\noutperforms single-agent models and market benchmarks in classification, asset\npricing, portfolio, and explainability performance.\n","authors":["Yichen Luo","Yebo Feng","Jiahua Xu","Paolo Tasca","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2501.00826v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03443v1","updated":"2025-01-07T00:09:52Z","published":"2025-01-07T00:09:52Z","title":"Optimization Learning","summary":" This article introduces the concept of optimization learning, a methodology\nto design optimization proxies that learn the input/output mapping of\nparametric optimization problems. These optimization proxies are trustworthy by\ndesign: they compute feasible solutions to the underlying optimization\nproblems, provide quality guarantees on the returned solutions, and scale to\nlarge instances. 
Optimization proxies are differentiable programs that combine\ntraditional deep learning technology with repair or completion layers to\nproduce feasible solutions. The article shows that optimization proxies can be\ntrained end-to-end in a self-supervised way. It presents methodologies to\nprovide performance guarantees and to scale optimization proxies to large-scale\noptimization problems. The potential of optimization proxies is highlighted\nthrough applications in power systems and, in particular, real-time risk\nassessment and security-constrained optimal power flow.\n","authors":["Pascal Van Hentenryck"],"pdf_url":"https://arxiv.org/pdf/2501.03443v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.01855v2","updated":"2025-01-07T23:47:06Z","published":"2024-10-01T22:47:24Z","title":"Explainable Diagnosis Prediction through Neuro-Symbolic Integration","summary":" Diagnosis prediction is a critical task in healthcare, where timely and\naccurate identification of medical conditions can significantly impact patient\noutcomes. Traditional machine learning and deep learning models have achieved\nnotable success in this domain but often lack interpretability which is a\ncrucial requirement in clinical settings. In this study, we explore the use of\nneuro-symbolic methods, specifically Logical Neural Networks (LNNs), to develop\nexplainable models for diagnosis prediction. Essentially, we design and\nimplement LNN-based models that integrate domain-specific knowledge through\nlogical rules with learnable thresholds. Our models, particularly\n$M_{\\text{multi-pathway}}$ and $M_{\\text{comprehensive}}$, demonstrate superior\nperformance over traditional models such as Logistic Regression, SVM, and\nRandom Forest, achieving higher accuracy (up to 80.52\\%) and AUROC scores (up\nto 0.8457) in the case study of diabetes prediction. 
The learned weights and\nthresholds within the LNN models provide direct insights into feature\ncontributions, enhancing interpretability without compromising predictive\npower. These findings highlight the potential of neuro-symbolic approaches in\nbridging the gap between accuracy and explainability in healthcare AI\napplications. By offering transparent and adaptable diagnostic models, our work\ncontributes to the advancement of precision medicine and supports the\ndevelopment of equitable healthcare solutions. Future research will focus on\nextending these methods to larger and more diverse datasets to further validate\ntheir applicability across different medical conditions and populations.\n","authors":["Qiuhao Lu","Rui Li","Elham Sagheb","Andrew Wen","Jinlian Wang","Liwei Wang","Jungwei W. Fan","Hongfang Liu"],"pdf_url":"https://arxiv.org/pdf/2410.01855v2.pdf","comment":"Proceedings of AMIA Informatics Summit 2025"},{"id":"http://arxiv.org/abs/2501.00790v2","updated":"2025-01-07T23:43:09Z","published":"2025-01-01T10:00:49Z","title":"LENS-XAI: Redefining Lightweight and Explainable Network Security\n through Knowledge Distillation and Variational Autoencoders for Scalable\n Intrusion Detection in Cybersecurity","summary":" The rapid proliferation of Industrial Internet of Things (IIoT) systems\nnecessitates advanced, interpretable, and scalable intrusion detection systems\n(IDS) to combat emerging cyber threats. Traditional IDS face challenges such as\nhigh computational demands, limited explainability, and inflexibility against\nevolving attack patterns. To address these limitations, this study introduces\nthe Lightweight Explainable Network Security framework (LENS-XAI), which\ncombines robust intrusion detection with enhanced interpretability and\nscalability. LENS-XAI integrates knowledge distillation, variational\nautoencoder models, and attribution-based explainability techniques to achieve\nhigh detection accuracy and transparency in decision-making. 
By leveraging a\ntraining set comprising 10% of the available data, the framework optimizes\ncomputational efficiency without sacrificing performance. Experimental\nevaluation on four benchmark datasets (Edge-IIoTset, UKM-IDS20, CTU-13, and\nNSL-KDD) demonstrates the framework's superior performance, achieving detection\naccuracies of 95.34%, 99.92%, 98.42%, and 99.34%, respectively. Additionally,\nthe framework excels in reducing false positives and adapting to complex attack\nscenarios, outperforming existing state-of-the-art methods. Key strengths of\nLENS-XAI include its lightweight design, suitable for resource-constrained\nenvironments, and its scalability across diverse IIoT and cybersecurity\ncontexts. Moreover, the explainability module enhances trust and transparency,\ncritical for practical deployment in dynamic and sensitive applications. This\nresearch contributes significantly to advancing IDS by addressing computational\nefficiency, feature interpretability, and real-world applicability. Future work\ncould focus on extending the framework to ensemble AI systems for distributed\nenvironments, further enhancing its robustness and adaptability.\n","authors":["Muhammet Anil Yagiz","Polat Goktas"],"pdf_url":"https://arxiv.org/pdf/2501.00790v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04182v1","updated":"2025-01-07T23:23:26Z","published":"2025-01-07T23:23:26Z","title":"Fixed Points of Deep Neural Networks: Emergence, Stability, and\n Applications","summary":" We present numerical and analytical results on the formation and stability of\na family of fixed points of deep neural networks (DNNs). Such fixed points\nappear in a class of DNNs when the dimensions of the input and output vectors\nare the same. We demonstrate examples of applications of such networks in\nsupervised, semi-supervised, and unsupervised learning, such as the\nencoding/decoding of images and the restoration of damaged images, among\nothers.\n We present several numerical and analytical results. 
First, we show that for\nuntrained DNNs with weights and biases initialized by normally distributed\nrandom variables, only one fixed point exists. This result holds for DNNs\nwith any depth (number of layers) $L$, any layer width $N$, and sigmoid-type\nactivation functions. Second, it has been shown that for a DNN whose parameters\n(weights and biases) are initialized by a ``light-tailed'' distribution of\nweights (e.g. a normal distribution), after training the distribution of these\nparameters becomes ``heavy-tailed''. This motivates our study of DNNs with\n``heavy-tailed'' initialization. For such DNNs we show numerically\nthat training leads to the emergence of $Q(N,L)$ fixed points, where\n$Q(N,L)$ is a positive integer which depends on the number of layers $L$ and\nthe layer width $N$. We further observe numerically that for fixed $N = N_0$ the\nfunction $Q(N_0, L)$ is non-monotone, that is, it initially grows as $L$\nincreases and then decreases to 1.\n This non-monotone behavior of $Q(N_0, L)$ is also obtained by analytical\nderivation of an equation for the Empirical Spectral Distribution (ESD) of the\ninput-output Jacobian, followed by a numerical solution of this equation.\n","authors":["L. Berlyand","V. Slavin"],"pdf_url":"https://arxiv.org/pdf/2501.04182v1.pdf","comment":"21 pages, 7 figures"},{"id":"http://arxiv.org/abs/2501.04180v1","updated":"2025-01-07T23:16:31Z","published":"2025-01-07T23:16:31Z","title":"HIVEX: A High-Impact Environment Suite for Multi-Agent Research\n (extended version)","summary":" Games have been vital test beds for the rapid development of agent-based\nresearch. Remarkable progress has been achieved in the past, but it is unclear\nwhether the findings carry over to real-world problems. While pressure grows, some of\nthe most critical ecological challenges can find mitigation and prevention\nsolutions through technology and its applications. 
Most real-world domains\ninclude multi-agent scenarios and require machine-machine and human-machine\ncollaboration. Open-source environments have not advanced and are often toy\nscenarios, too abstract or not suitable for multi-agent research. By mimicking\nreal-world problems and increasing the complexity of environments, we hope to\nadvance state-of-the-art multi-agent research and inspire researchers to work\non immediate real-world problems. Here, we present HIVEX, an environment suite\nto benchmark multi-agent research focusing on ecological challenges. HIVEX\nincludes the following environments: Wind Farm Control, Wildfire Resource\nManagement, Drone-Based Reforestation, Ocean Plastic Collection, and Aerial\nWildfire Suppression. We provide environments, training examples, and baselines\nfor the main and sub-tasks. All trained models resulting from the experiments\nof this work are hosted on Hugging Face. We also provide a leaderboard on\nHugging Face and encourage the community to submit models trained on our\nenvironment suite.\n","authors":["Philipp D. Siedler"],"pdf_url":"https://arxiv.org/pdf/2501.04180v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04173v1","updated":"2025-01-07T22:53:56Z","published":"2025-01-07T22:53:56Z","title":"Multimodal Multihop Source Retrieval for Web Question Answering","summary":" This work deals with the challenge of learning and reasoning over multi-modal\nmulti-hop question answering (QA). We propose a graph reasoning network based\non the semantic structure of the sentences to learn multi-source reasoning\npaths and find the supporting facts across both image and text modalities for\nanswering the question. In this paper, we investigate the importance of graph\nstructure for multi-modal multi-hop question answering. Our analysis is\ncentered on WebQA. We construct a strong baseline model, that finds relevant\nsources using a pairwise classification task. 
We establish that, with the\nproper use of feature representations from pre-trained models, graph structure\nhelps in improving multi-modal multi-hop question answering. We point out that\nboth the graph structure and the adjacency matrix are task-related prior\nknowledge, and graph structure can be leveraged to improve the retrieval\nperformance for the task. Experiments and visualized analysis demonstrate that\nmessage propagation over graph networks or the entire graph structure can\nreplace massive multimodal transformers with token-wise cross-attention. We\ndemonstrate the applicability of our method and show a performance gain of\n\\textbf{4.6$\\%$} retrieval F1 score over the transformer baselines, despite\nusing a very light model. We further demonstrate the applicability of our model\nto a large-scale retrieval setting.\n","authors":["Navya Yarrabelly","Saloni Mittal"],"pdf_url":"https://arxiv.org/pdf/2501.04173v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2010.03604 by other authors"},{"id":"http://arxiv.org/abs/2501.04169v1","updated":"2025-01-07T22:33:47Z","published":"2025-01-07T22:33:47Z","title":"Learning to Transfer Human Hand Skills for Robot Manipulations","summary":" We present a method for teaching dexterous manipulation tasks to robots from\nhuman hand motion demonstrations. Unlike existing approaches that solely rely\non kinematics information without taking into account the plausibility of robot\nand object interaction, our method directly infers plausible robot manipulation\nactions from human motion demonstrations. To address the embodiment gap between\nthe human hand and the robot system, our approach learns a joint motion\nmanifold that maps human hand movements, robot hand actions, and object\nmovements in 3D, enabling us to infer one motion component from others. Our key\nidea is the generation of pseudo-supervision triplets, which pair human,\nobject, and robot motion trajectories synthetically. 
Through real-world\nexperiments with robot hand manipulation, we demonstrate that our data-driven\nretargeting method significantly outperforms conventional retargeting\ntechniques, effectively bridging the embodiment gap between human and robotic\nhands. Website at https://rureadyo.github.io/MocapRobot/.\n","authors":["Sungjae Park","Seungho Lee","Mingi Choi","Jiye Lee","Jeonghwan Kim","Jisoo Kim","Hanbyul Joo"],"pdf_url":"https://arxiv.org/pdf/2501.04169v1.pdf","comment":"Preprint. Under Review"},{"id":"http://arxiv.org/abs/2501.04167v1","updated":"2025-01-07T22:29:08Z","published":"2025-01-07T22:29:08Z","title":"Reasoning-Enhanced Self-Training for Long-Form Personalized Text\n Generation","summary":" Personalized text generation requires a unique ability of large language\nmodels (LLMs) to learn from context that they often do not encounter during\ntheir standard training. One way to encourage LLMs to better use personalized\ncontext for generating outputs that better align with the user's expectations\nis to instruct them to reason over the user's past preferences, background\nknowledge, or writing style. To achieve this, we propose Reasoning-Enhanced\nSelf-Training for Personalized Text Generation (REST-PG), a framework that\ntrains LLMs to reason over personal data during response generation. REST-PG\nfirst generates reasoning paths to train the LLM's reasoning abilities and then\nemploys Expectation-Maximization Reinforced Self-Training to iteratively train\nthe LLM based on its own high-reward outputs. We evaluate REST-PG on the\nLongLaMP benchmark, consisting of four diverse personalized long-form text\ngeneration tasks. 
Our experiments demonstrate that REST-PG achieves significant\nimprovements over state-of-the-art baselines, with an average relative\nperformance gain of 14.5% on the benchmark.\n","authors":["Alireza Salemi","Cheng Li","Mingyang Zhang","Qiaozhu Mei","Weize Kong","Tao Chen","Zhuowan Li","Michael Bendersky","Hamed Zamani"],"pdf_url":"https://arxiv.org/pdf/2501.04167v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.16181v2","updated":"2025-01-07T22:12:47Z","published":"2024-12-10T16:51:11Z","title":"Minimum Weighted Feedback Arc Sets for Ranking from Pairwise Comparisons","summary":" The Minimum Weighted Feedback Arc Set (MWFAS) problem is fundamentally\nconnected to the Ranking Problem -- the task of deriving global rankings from\npairwise comparisons. Recent work [He et al. ICML2022] has advanced the\nstate-of-the-art for the Ranking Problem using learning-based methods,\nimproving upon multiple previous approaches. However, the connection to MWFAS\nremains underexplored. This paper investigates this relationship and presents\nefficient combinatorial algorithms for solving MWFAS, thus addressing the\nRanking Problem. Our experimental results demonstrate that these simple,\nlearning-free algorithms not only significantly outperform learning-based\nmethods in terms of speed but also generally achieve superior ranking accuracy.\n","authors":["Soroush Vahidi","Ioannis Koutis"],"pdf_url":"https://arxiv.org/pdf/2412.16181v2.pdf","comment":"This is a preliminary paper"},{"id":"http://arxiv.org/abs/2410.19313v2","updated":"2025-01-07T21:52:46Z","published":"2024-10-25T05:59:30Z","title":"COAT: Compressing Optimizer states and Activation for Memory-Efficient\n FP8 Training","summary":" FP8 training has emerged as a promising method for improving training\nefficiency. Existing frameworks accelerate training by applying FP8 computation\nto linear layers while leaving optimizer states and activations in higher\nprecision, which fails to fully optimize memory usage. 
This paper introduces\nCOAT (Compressing Optimizer States and Activations for FP8 Training), a novel\nFP8 training framework designed to significantly reduce memory footprint when\ntraining large models. COAT addresses current limitations through two key\ninnovations: (1) Dynamic Range Expansion, which aligns optimizer state\ndistributions more closely with the FP8 representation range, thereby reducing\nquantization error, and (2) Mixed-Granularity Activation Quantization, which\noptimizes activation memory using a combination of per-tensor and per-group\nquantization strategies. Experiments demonstrate that COAT effectively reduces\nend-to-end training memory footprint by 1.54x compared to BF16 while achieving\nnearly lossless performance across various tasks, such as Large Language Model\npretraining and fine-tuning and Vision Language Model training. COAT also\nachieves a 1.43x end-to-end training speedup compared to BF16, performing on\npar with or surpassing TransformerEngine's speedup. COAT enables efficient\nfull-parameter training of large models on fewer GPUs, and facilitates doubling\nthe batch size in distributed training settings, providing a practical solution\nfor scaling large-scale model training. The code is available at\nhttps://github.com/NVlabs/COAT.\n","authors":["Haocheng Xi","Han Cai","Ligeng Zhu","Yao Lu","Kurt Keutzer","Jianfei Chen","Song Han"],"pdf_url":"https://arxiv.org/pdf/2410.19313v2.pdf","comment":"22 pages. 9 Figures. 13 Tables"},{"id":"http://arxiv.org/abs/2301.01828v3","updated":"2025-01-07T21:38:31Z","published":"2023-01-04T21:33:13Z","title":"On Sequential Bayesian Inference for Continual Learning","summary":" Sequential Bayesian inference can be used for continual learning to prevent\ncatastrophic forgetting of past tasks and provide an informative prior when\nlearning new tasks. 
We revisit sequential Bayesian inference and test whether\nhaving access to the true posterior is guaranteed to prevent catastrophic\nforgetting in Bayesian neural networks. To do this, we perform sequential\nBayesian inference using Hamiltonian Monte Carlo. We propagate the posterior as\na prior for new tasks by fitting a density estimator on Hamiltonian Monte Carlo\nsamples. We find that this approach fails to prevent catastrophic forgetting,\ndemonstrating the difficulty in performing sequential Bayesian inference in\nneural networks. From there we study simple analytical examples of sequential\nBayesian inference and continual learning (CL) and highlight the issue of model\nmisspecification, which can lead to sub-optimal continual learning performance\ndespite exact inference. Furthermore, we discuss how task data imbalances can\ncause forgetting. From these limitations, we argue that we need probabilistic\nmodels of the continual learning generative process rather than relying on\nsequential Bayesian inference over Bayesian neural network weights. In this\nvein, we also propose a simple baseline called Prototypical Bayesian Continual\nLearning, which is competitive with state-of-the-art Bayesian continual\nlearning methods on class-incremental continual learning vision benchmarks.\n","authors":["Samuel Kessler","Adam Cobb","Tim G. J. Rudner","Stefan Zohren","Stephen J. Roberts"],"pdf_url":"https://arxiv.org/pdf/2301.01828v3.pdf","comment":"Supercedes Entropy publication with updates to Section 4"},{"id":"http://arxiv.org/abs/2405.17044v3","updated":"2025-01-07T21:29:45Z","published":"2024-05-27T11:00:51Z","title":"Interesting Scientific Idea Generation using Knowledge Graphs and LLMs:\n Evaluations with 100 Research Group Leaders","summary":" The rapid growth of scientific literature makes it challenging for\nresearchers to identify novel and impactful ideas, especially across\ndisciplines. 
Modern artificial intelligence (AI) systems offer new approaches,\npotentially inspiring ideas not conceived by humans alone. But how compelling\nare these AI-generated ideas, and how can we improve their quality? Here, we\nintroduce SciMuse, which uses 58 million research papers and a large-language\nmodel to generate research ideas. We conduct a large-scale evaluation in which\nover 100 research group leaders -- from natural sciences to humanities --\nranked more than 4,400 personalized ideas based on their interest. This data\nallows us to predict research interest using (1) supervised neural networks\ntrained on human evaluations, and (2) unsupervised zero-shot ranking with\nlarge-language models. Our results demonstrate how future systems can help\ngenerate compelling research ideas and foster unforeseen interdisciplinary\ncollaborations.\n","authors":["Xuemei Gu","Mario Krenn"],"pdf_url":"https://arxiv.org/pdf/2405.17044v3.pdf","comment":"8 pages; 4 figures; Appendix: 6 pages, 5 figures, 2 tables"},{"id":"http://arxiv.org/abs/2402.08640v3","updated":"2025-01-07T21:19:30Z","published":"2024-02-13T18:09:38Z","title":"Forecasting high-impact research topics via machine learning on evolving\n knowledge graphs","summary":" The exponential growth in scientific publications poses a severe challenge\nfor human researchers. It forces attention to narrower sub-fields, which\nmakes it challenging to discover new impactful research ideas and\ncollaborations outside one's own field. While there are ways to predict a\nscientific paper's future citation counts, they need the research to be\nfinished and the paper written, usually assessing impact long after the idea\nwas conceived. Here we show how to predict the impact of onsets of ideas that\nhave never been published by researchers. For that, we developed a large\nevolving knowledge graph built from more than 21 million scientific papers. 
It\ncombines a semantic network created from the content of the papers and an\nimpact network created from the historic citations of papers. Using machine\nlearning, we can predict the dynamic of the evolving network into the future\nwith high accuracy (AUC values beyond 0.9 for most experiments), and thereby\nthe impact of new research directions. We envision that the ability to predict\nthe impact of new ideas will be a crucial component of future artificial muses\nthat can inspire new impactful and interesting scientific ideas.\n","authors":["Xuemei Gu","Mario Krenn"],"pdf_url":"https://arxiv.org/pdf/2402.08640v3.pdf","comment":"13 pages, 12 figures, Comments welcome!"},{"id":"http://arxiv.org/abs/2501.04142v1","updated":"2025-01-07T21:10:16Z","published":"2025-01-07T21:10:16Z","title":"BiasGuard: Guardrailing Fairness in Machine Learning Production Systems","summary":" As machine learning (ML) systems increasingly impact critical sectors such as\nhiring, financial risk assessments, and criminal justice, the imperative to\nensure fairness has intensified due to potential negative implications. While\nmuch ML fairness research has focused on enhancing training data and processes,\naddressing the outputs of already deployed systems has received less attention.\nThis paper introduces 'BiasGuard', a novel approach designed to act as a\nfairness guardrail in production ML systems. BiasGuard leverages Test-Time\nAugmentation (TTA) powered by Conditional Generative Adversarial Network\n(CTGAN), a cutting-edge generative AI model, to synthesize data samples\nconditioned on inverted protected attribute values, thereby promoting equitable\noutcomes across diverse groups. This method aims to provide equal opportunities\nfor both privileged and unprivileged groups while significantly enhancing the\nfairness metrics of deployed systems without the need for retraining. 
Our\ncomprehensive experimental analysis across diverse datasets reveals that\nBiasGuard enhances fairness by 31% while only reducing accuracy by 0.09%\ncompared to non-mitigated benchmarks. Additionally, BiasGuard outperforms\nexisting post-processing methods in improving fairness, positioning it as an\neffective tool to safeguard against biases when retraining the model is\nimpractical.\n","authors":["Nurit Cohen-Inger","Seffi Cohen","Neomi Rabaev","Lior Rokach","Bracha Shapira"],"pdf_url":"https://arxiv.org/pdf/2501.04142v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.02393v2","updated":"2025-01-07T21:04:14Z","published":"2025-01-04T22:30:21Z","title":"Graph-Aware Isomorphic Attention for Adaptive Dynamics in Transformers","summary":" We present an approach to modifying Transformer architectures by integrating\ngraph-aware relational reasoning into the attention mechanism, merging concepts\nfrom graph neural networks and language modeling. Building on the inherent\nconnection between attention and graph theory, we reformulate the Transformer's\nattention mechanism as a graph operation and propose Graph-Aware Isomorphic\nAttention. This method leverages advanced graph modeling strategies, including\nGraph Isomorphism Networks (GIN) and Principal Neighborhood Aggregation (PNA),\nto enrich the representation of relational structures. Our approach captures\ncomplex dependencies and generalizes across tasks, as evidenced by a reduced\ngeneralization gap and improved learning performance. Additionally, we expand\nthe concept of graph-aware attention to introduce Sparse GIN-Attention, a\nfine-tuning approach that employs sparse GINs. By interpreting attention\nmatrices as sparse adjacency graphs, this technique enhances the adaptability\nof pre-trained foundational models with minimal computational overhead,\nendowing them with graph-aware capabilities. 
Sparse GIN-Attention fine-tuning\nachieves improved training dynamics and better generalization compared to\nalternative methods like low-rank adaptation (LoRA). We discuss latent\ngraph-like structures within traditional attention mechanisms, offering a new\nlens through which Transformers can be understood: as evolving hierarchical GIN\nmodels for relational reasoning. This perspective suggests profound\nimplications for foundational model development, enabling the design of\narchitectures that dynamically adapt to both local and global dependencies.\nApplications in bioinformatics, materials science, language modeling, and\nbeyond could benefit from this synthesis of relational and sequential data\nmodeling, setting the stage for interpretable and generalizable modeling\nstrategies.\n","authors":["Markus J. Buehler"],"pdf_url":"https://arxiv.org/pdf/2501.02393v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04136v1","updated":"2025-01-07T20:52:08Z","published":"2025-01-07T20:52:08Z","title":"Implementing Systemic Thinking for Automatic Schema Matching: An\n Agent-Based Modeling Approach","summary":" Several approaches have been proposed to deal with the problem of Automatic\nSchema Matching (ASM). The challenges and difficulties caused by the complexity\nand uncertainty characterizing both the process and the outcome of Schema\nMatching motivated us to investigate how a bio-inspired emerging paradigm can\nhelp with understanding, managing, and ultimately overcoming those challenges.\nIn this paper, we explain how we approached Automatic Schema Matching as a\nsystemic and Complex Adaptive System (CAS) and how we modeled it using the\napproach of Agent-Based Modeling and Simulation (ABMS). This effort gives birth\nto a tool (prototype) for schema matching called Reflex-SMAS. 
A set of\nexperiments demonstrates the viability of our approach in two main aspects: (i)\neffectiveness (increasing the quality of the found matchings) and (ii)\nefficiency (reducing the effort required for the matching). Our approach\nrepresents a significant paradigm shift in the field of Automatic Schema\nMatching.\n","authors":["Hicham Assoudi","Hakim Lounis"],"pdf_url":"https://arxiv.org/pdf/2501.04136v1.pdf","comment":"COGNITIVE 2018 : The Tenth International Conference on Advanced\n Cognitive Technologies and Applications"},{"id":"http://arxiv.org/abs/2308.05764v2","updated":"2025-01-07T20:50:51Z","published":"2023-08-09T10:05:11Z","title":"Unlocking the diagnostic potential of electrocardiograms through\n information transfer from cardiac magnetic resonance imaging","summary":" Cardiovascular diseases (CVD) can be diagnosed using various diagnostic\nmodalities. The electrocardiogram (ECG) is a cost-effective and widely\navailable diagnostic aid that provides functional information of the heart.\nHowever, its ability to classify and spatially localise CVD is limited. In\ncontrast, cardiac magnetic resonance (CMR) imaging provides detailed structural\ninformation of the heart and thus enables evidence-based diagnosis of CVD, but\nlong scan times and high costs limit its use in clinical routine. In this work,\nwe present a deep learning strategy for cost-effective and comprehensive\ncardiac screening solely from ECG. Our approach combines multimodal contrastive\nlearning with masked data modelling to transfer domain-specific information\nfrom CMR imaging to ECG representations. In extensive experiments using data\nfrom 40,044 UK Biobank subjects, we demonstrate the utility and\ngeneralisability of our method for subject-specific risk prediction of CVD and\nthe prediction of cardiac phenotypes using only ECG data. 
Specifically, our\nnovel multimodal pre-training paradigm improves performance by up to 12.19 %\nfor risk prediction and 27.59 % for phenotype prediction. In a qualitative\nanalysis, we demonstrate that our learned ECG representations incorporate\ninformation from CMR image regions of interest. Our entire pipeline is publicly\navailable at https://github.com/oetu/MMCL-ECG-CMR.\n","authors":["Özgün Turgut","Philip Müller","Paul Hager","Suprosanna Shit","Sophie Starck","Martin J. Menten","Eimo Martens","Daniel Rueckert"],"pdf_url":"https://arxiv.org/pdf/2308.05764v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00530v2","updated":"2025-01-07T20:36:35Z","published":"2024-03-31T02:05:40Z","title":"Comparing Bad Apples to Good Oranges: Aligning Large Language Models via\n Joint Preference Optimization","summary":" A common technique for aligning large language models (LLMs) relies on\nacquiring human preferences by comparing multiple generations conditioned on a\nfixed context. This method, however, relies solely on pairwise comparisons,\nwhere the generations are evaluated within an identical context. While\neffective, such conditional preferences often fail to encompass the nuanced\nand multidimensional nature of human preferences. In this work, we revisit the\ntraditional paradigm of preference acquisition and propose a new axis based on\neliciting preferences jointly over the instruction-response pairs. Unlike prior\npreference optimizations, which are designed for conditional ranking protocols\n(e.g., DPO), we propose Joint Preference Optimization (JPO), a new preference\noptimization objective that upweights the joint probability of the chosen\ninstruction-response pair over the rejected instruction-response pair.\nInterestingly, LLMs trained with joint instruction-response preference data\nusing JPO outperform LLMs trained with DPO by $5.2\\%$ and $3.3\\%$ win-rate for\nsummarization and open-ended dialogue datasets, respectively. 
Our findings\nreveal that joint preferences over instruction and response pairs can\nsignificantly enhance the alignment of LLMs by tapping into a broader spectrum\nof human preference elicitation. The data and code are available at\nhttps://github.com/Hritikbansal/dove.\n","authors":["Hritik Bansal","Ashima Suvarna","Gantavya Bhatt","Nanyun Peng","Kai-Wei Chang","Aditya Grover"],"pdf_url":"https://arxiv.org/pdf/2404.00530v2.pdf","comment":"22 pages, 16 figures, 7 tables"},{"id":"http://arxiv.org/abs/2412.05781v3","updated":"2025-01-07T20:27:09Z","published":"2024-12-08T02:27:17Z","title":"Open-Source Acceleration of Stable-Diffusion.cpp Deployable on All\n Devices","summary":" Stable diffusion plays a crucial role in generating high-quality images.\nHowever, image generation is time-consuming and memory-intensive. To address\nthis, stable-diffusion.cpp (Sdcpp) emerges as an efficient inference framework\nto accelerate the diffusion models. Although it is lightweight, the current\nimplementation of the ggml_conv_2d operator in Sdcpp is suboptimal, exhibiting both\nhigh inference latency and massive memory usage. To address this, in this work,\nwe present an optimized version of Sdcpp leveraging the Winograd algorithm to\naccelerate 2D convolution operations, which is the primary bottleneck in the\npipeline. By analyzing both dependent and independent computation graphs, we\nexploit the device's locality and parallelism to achieve substantial\nperformance improvements. Our framework delivers correct end-to-end results\nacross various stable diffusion models, including SDv1.4, v1.5, v2.1, SDXL, and\nSDXL-Turbo. 
Our evaluation results demonstrate a speedup of up to 2.76x for\nindividual convolutional layers and an inference speedup of up to 4.79x for the\noverall image generation process, compared with the original Sdcpp on M1 Pro.\nHomepage: https://github.com/SealAILab/stable-diffusion-cpp\n","authors":["Jingxu Ng","Cheng Lv","Pu Zhao","Wei Niu","Juyi Lin","Minzhou Pan","Yun Liang","Yanzhi Wang"],"pdf_url":"https://arxiv.org/pdf/2412.05781v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09529v2","updated":"2025-01-07T20:26:34Z","published":"2024-08-18T16:26:39Z","title":"Revisiting the Graph Reasoning Ability of Large Language Models: Case\n Studies in Translation, Connectivity and Shortest Path","summary":" Large Language Models (LLMs) have achieved great success in various reasoning\ntasks. In this work, we focus on the graph reasoning ability of LLMs. Although\ntheoretical studies proved that LLMs are capable of handling graph reasoning\ntasks, empirical evaluations reveal numerous failures. To deepen our\nunderstanding of this discrepancy, we revisit the ability of LLMs on three\nfundamental graph tasks: graph description translation, graph connectivity, and\nthe shortest-path problem. Our findings suggest that LLMs can fail to\nunderstand graph structures through text descriptions and exhibit varying\nperformance across all three fundamental tasks. Meanwhile, we perform a\nreal-world investigation on knowledge graphs and make consistent observations\nwith our findings. 
The code and datasets are available.\n","authors":["Xinnan Dai","Qihao Wen","Yifei Shen","Hongzhi Wen","Dongsheng Li","Jiliang Tang","Caihua Shan"],"pdf_url":"https://arxiv.org/pdf/2408.09529v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04108v1","updated":"2025-01-07T19:35:19Z","published":"2025-01-07T19:35:19Z","title":"TrojanDec: Data-free Detection of Trojan Inputs in Self-supervised\n Learning","summary":" An image encoder pre-trained by self-supervised learning can be used as a\ngeneral-purpose feature extractor to build downstream classifiers for various\ndownstream tasks. However, many studies showed that an attacker can embed a\ntrojan into an encoder such that multiple downstream classifiers built based on\nthe trojaned encoder simultaneously inherit the trojan behavior. In this work,\nwe propose TrojanDec, the first data-free method to identify and recover a test\ninput embedded with a trigger. Given a (trojaned or clean) encoder and a test\ninput, TrojanDec first predicts whether the test input is trojaned. If not, the\ntest input is processed in a normal way to maintain the utility. Otherwise, the\ntest input will be further restored to remove the trigger. Our extensive\nevaluation shows that TrojanDec can effectively identify the trojan (if any)\nfrom a given test input and recover it under state-of-the-art trojan attacks.\nWe further demonstrate by experiments that our TrojanDec outperforms the\nstate-of-the-art defenses.\n","authors":["Yupei Liu","Yanting Wang","Jinyuan Jia"],"pdf_url":"https://arxiv.org/pdf/2501.04108v1.pdf","comment":"To appear in AAAI 2025"},{"id":"http://arxiv.org/abs/2501.04102v1","updated":"2025-01-07T19:19:22Z","published":"2025-01-07T19:19:22Z","title":"Enhancing Distribution and Label Consistency for Graph\n Out-of-Distribution Generalization","summary":" To deal with distribution shifts in graph data, various graph\nout-of-distribution (OOD) generalization techniques have been recently\nproposed. 
These methods often employ a two-step strategy that first creates\naugmented environments and subsequently identifies invariant subgraphs to\nimprove generalizability. Nevertheless, this approach could be suboptimal from\nthe perspective of consistency. First, the process of augmenting environments\nby altering the graphs while preserving labels may lead to graphs that are not\nrealistic or meaningfully related to the original distribution, thus lacking\ndistribution consistency. Second, the extracted subgraphs are obtained from\ndirectly modifying graphs, and may not necessarily maintain a consistent\npredictive relationship with their labels, thereby impacting label consistency.\nIn response to these challenges, we introduce an innovative approach that aims\nto enhance these two types of consistency for graph OOD generalization. We\npropose a modifier to obtain both augmented and invariant graphs in a unified\nmanner. With the augmented graphs, we enrich the training data without\ncompromising the integrity of label-graph relationships. The label consistency\nenhancement in our framework further preserves the supervision information in\nthe invariant graph. We conduct extensive experiments on real-world datasets to\ndemonstrate the superiority of our framework over other state-of-the-art\nbaselines.\n","authors":["Song Wang","Xiaodong Yang","Rashidul Islam","Huiyuan Chen","Minghua Xu","Jundong Li","Yiwei Cai"],"pdf_url":"https://arxiv.org/pdf/2501.04102v1.pdf","comment":"Accepted by ICDM 2024"},{"id":"http://arxiv.org/abs/2501.04072v1","updated":"2025-01-07T16:45:41Z","published":"2025-01-07T16:45:41Z","title":"Multi-armed Bandit and Backbone boost Lin-Kernighan-Helsgaun Algorithm\n for the Traveling Salesman Problems","summary":" The Lin-Kernighan-Helsgaun (LKH) heuristic is a classic local search\nalgorithm for the Traveling Salesman Problem (TSP). 
LKH introduces an\n$\\alpha$-value to replace the traditional distance metric for evaluating the\nedge quality, which leads to a significant improvement. However, we observe\nthat the $\\alpha$-value does not make full use of the historical information\nduring the search, and a single source of guiding information often makes it\nhard for LKH to escape from local optima. To address the above issues, we propose a novel\nway to extract backbone information during the TSP local search process, which\nis dynamic and can be updated once a local optimal solution is found. We\nfurther propose to combine backbone information, $\\alpha$-value, and distance\nto evaluate the edge quality so as to guide the search. Moreover, we abstract\ntheir different combinations as arms in a multi-armed bandit (MAB) and use an\nMAB model to help the algorithm select an appropriate evaluation metric\ndynamically. Both the backbone information and the MAB can provide diverse guiding\ninformation and learn from the search history to suggest the best metric. We\napply our methods to LKH and LKH-3, an extended version of LKH that\ncan be used to solve about 40 variants of the TSP and the Vehicle Routing\nProblem (VRP). Extensive experiments show the excellent performance and\ngeneralization capability of our proposed method, significantly improving LKH\nfor TSP and LKH-3 for two representative TSP and VRP variants, the Colored TSP\n(CTSP) and Capacitated VRP with Time Windows (CVRPTW).\n","authors":["Long Wang","Jiongzhi Zheng","Zhengda Xiong","Kun He"],"pdf_url":"https://arxiv.org/pdf/2501.04072v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04070v1","updated":"2025-01-07T14:57:08Z","published":"2025-01-07T14:57:08Z","title":"More is not always better? Enhancing Many-Shot In-Context Learning with\n Differentiated and Reweighting Objectives","summary":" Large language models (LLMs) excel at few-shot in-context learning (ICL)\nwithout requiring parameter updates. 
However, as the number of ICL\ndemonstrations increases from a few to many, performance tends to plateau and\neventually decline. We identify two primary causes for this trend: the\nsuboptimal negative log-likelihood (NLL) optimization objective and the\nincremental data noise. To address these issues, we introduce DR-ICL, a novel\noptimization method that enhances model performance through Differentiated\nLearning and advantage-based Reweighting objectives. Globally, DR-ICL utilizes\ndifferentiated learning to optimize the NLL objective, ensuring that many-shot\nperformance surpasses zero-shot levels. Locally, it dynamically adjusts the\nweighting of many-shot demonstrations by leveraging cumulative advantages\ninspired by reinforcement learning, thereby improving generalization. This\napproach allows the model to handle varying numbers of shots effectively,\nmitigating the impact of noisy data. Recognizing the lack of multi-task\ndatasets with diverse many-shot distributions, we develop the Many-Shot ICL\nBenchmark (MICLB)-a large-scale benchmark covering shot numbers from 1 to 350\nwithin sequences of up to 8,000 tokens-for fine-tuning purposes. MICLB\nfacilitates the evaluation of many-shot ICL strategies across seven prominent\nNLP tasks and 50 distinct datasets. Experimental results demonstrate that LLMs\nenhanced with DR-ICL achieve significant improvements in many-shot setups\nacross various tasks, including both in-domain and out-of-domain scenarios. 
We\nrelease the code and benchmark dataset hoping to facilitate further research in\nmany-shot ICL.\n","authors":["Xiaoqing Zhang","Ang Lv","Yuhan Liu","Flood Sung","Wei Liu","Shuo Shang","Xiuying Chen","Rui Yan"],"pdf_url":"https://arxiv.org/pdf/2501.04070v1.pdf","comment":"13 pages, 8 figures, 11 tables"},{"id":"http://arxiv.org/abs/2501.04068v1","updated":"2025-01-07T13:54:19Z","published":"2025-01-07T13:54:19Z","title":"Explainable Reinforcement Learning for Formula One Race Strategy","summary":" In Formula One, teams compete to develop their cars and achieve the highest\npossible finishing position in each race. During a race, however, teams are\nunable to alter the car, so they must improve their cars' finishing positions\nvia race strategy, i.e. optimising their selection of which tyre compounds to\nput on the car and when to do so. In this work, we introduce a reinforcement\nlearning model, RSRL (Race Strategy Reinforcement Learning), to control race\nstrategies in simulations, offering a faster alternative to the industry\nstandard of hard-coded and Monte Carlo-based race strategies. Controlling cars\nwith a pace equating to an expected finishing position of P5.5 (where P1\nrepresents first place and P20 is last place), RSRL achieves an average\nfinishing position of P5.33 on our test race, the 2023 Bahrain Grand Prix,\noutperforming the best baseline of P5.63. We then demonstrate, in a\ngeneralisability study, how performance for one track or multiple tracks can be\nprioritised via training. Further, we supplement model predictions with feature\nimportance, decision tree-based surrogate models, and decision tree\ncounterfactuals towards improving user trust in the model. 
Finally, we provide\nillustrations which exemplify our approach in real-world situations, drawing\nparallels between simulations and reality.\n","authors":["Devin Thomas","Junqi Jiang","Avinash Kori","Aaron Russo","Steffen Winkler","Stuart Sale","Joseph McMillan","Francesco Belardinelli","Antonio Rago"],"pdf_url":"https://arxiv.org/pdf/2501.04068v1.pdf","comment":"9 pages, 6 figures. Copyright ACM 2025. This is the authors' version\n of the work. It is posted here for your personal use. Not for redistribution.\n The definitive Version of Record will be published in SAC 2025,\n http://dx.doi.org/10.1145/3672608.3707766"},{"id":"http://arxiv.org/abs/2501.04067v1","updated":"2025-01-07T12:38:48Z","published":"2025-01-07T12:38:48Z","title":"Explainable Time Series Prediction of Tyre Energy in Formula One Race\n Strategy","summary":" Formula One (F1) race strategy takes place in a high-pressure and fast-paced\nenvironment where split-second decisions can drastically affect race results.\nTwo of the core decisions of race strategy are when to make pit stops (i.e.\nreplace the cars' tyres) and which tyre compounds (hard, medium or soft, in\nnormal conditions) to select. The optimal pit stop decisions can be determined\nby estimating the tyre degradation of these compounds, which in turn can be\ncomputed from the energy applied to each tyre, i.e. the tyre energy. In this\nwork, we trained deep learning models, using the Mercedes-AMG PETRONAS F1\nteam's historic race data consisting of telemetry, to forecast tyre energies\nduring races. Additionally, we fitted XGBoost, a decision tree-based machine\nlearning algorithm, to the same dataset and compared the results, with both\ngiving impressive performance. Furthermore, we incorporated two different\nexplainable AI methods, namely feature importance and counterfactual\nexplanations, to gain insights into the reasoning behind the forecasts. 
Our\ncontributions thus result in an explainable, automated method which could\nassist F1 teams in optimising their race strategy.\n","authors":["Jamie Todd","Junqi Jiang","Aaron Russo","Steffen Winkler","Stuart Sale","Joseph McMillan","Antonio Rago"],"pdf_url":"https://arxiv.org/pdf/2501.04067v1.pdf","comment":"9 pages, 9 figures. Copyright ACM 2025. This is the authors' version\n of the work. It is posted here for your personal use. Not for redistribution.\n The definitive Version of Record will be published in SAC 2025,\n http://dx.doi.org/10.1145/3672608.3707765"},{"id":"http://arxiv.org/abs/2501.04062v1","updated":"2025-01-07T10:39:14Z","published":"2025-01-07T10:39:14Z","title":"ChronoLLM: A Framework for Customizing Large Language Model for Digital\n Twins generalization based on PyChrono","summary":" Recently, the integration of advanced simulation technologies with artificial\nintelligence (AI) is revolutionizing science and engineering research.\nChronoLlama introduces a novel framework that customizes the open-source LLMs,\nspecifically for code generation, paired with PyChrono for multi-physics\nsimulations. This integration aims to automate and improve the creation of\nsimulation scripts, thus enhancing model accuracy and efficiency. This\ncombination harnesses the speed of AI-driven code generation with the\nreliability of physics-based simulations, providing a powerful tool for\nresearchers and engineers. Empirical results indicate substantial enhancements\nin simulation setup speed, accuracy of the generated codes, and overall\ncomputational efficiency. ChronoLlama not only expedites the development and\ntesting of multibody systems but also spearheads a scalable, AI-enhanced\napproach to managing intricate mechanical simulations. 
This pioneering\nintegration of cutting-edge AI with traditional simulation platforms represents\na significant leap forward in automating and optimizing design processes in\nengineering applications.\n","authors":["Jingquan Wang","Harry Zhang","Khailanii Slaton","Shu Wang","Radu Serban","Jinlong Wu","Dan Negrut"],"pdf_url":"https://arxiv.org/pdf/2501.04062v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04734v1","updated":"2025-01-07T19:48:30Z","published":"2025-01-07T19:48:30Z","title":"Generative Style Transfer for MRI Image Segmentation: A Case of Glioma\n Segmentation in Sub-Saharan Africa","summary":" In Sub-Saharan Africa (SSA), the utilization of lower-quality Magnetic\nResonance Imaging (MRI) technology raises questions about the applicability of\nmachine learning methods for clinical tasks. This study aims to provide a\nrobust deep learning-based brain tumor segmentation (BraTS) method tailored for\nthe SSA population using a threefold approach. Firstly, the impact of domain\nshift from the SSA training data on model efficacy was examined, revealing no\nsignificant effect. Secondly, a comparative analysis of 3D and 2D\nfull-resolution models using the nnU-Net framework indicates similar\nperformance of both the models trained for 300 epochs achieving a five-fold\ncross-validation score of 0.93. Lastly, addressing the performance gap observed\nin SSA validation as opposed to the relatively larger BraTS glioma (GLI)\nvalidation set, two strategies are proposed: fine-tuning SSA cases using the\nGLI+SSA best-pretrained 2D fullres model at 300 epochs, and introducing a novel\nneural style transfer-based data augmentation technique for the SSA cases. This\ninvestigation underscores the potential of enhancing brain tumor prediction\nwithin SSA's unique healthcare landscape.\n","authors":["Rancy Chepchirchir","Jill Sunday","Raymond Confidence","Dong Zhang","Talha Chaudhry","Udunna C. 
Anazodo","Kendi Muchungi","Yujing Zou"],"pdf_url":"https://arxiv.org/pdf/2501.04734v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04733v1","updated":"2025-01-07T18:59:53Z","published":"2025-01-07T18:59:53Z","title":"AI-Driven Reinvention of Hydrological Modeling for Accurate Predictions\n and Interpretation to Transform Earth System Modeling","summary":" Traditional equation-driven hydrological models often struggle to accurately\npredict streamflow in challenging regional Earth systems like the Tibetan\nPlateau, while hybrid and existing algorithm-driven models face difficulties in\ninterpreting hydrological behaviors. This work introduces HydroTrace, an\nalgorithm-driven, data-agnostic model that substantially outperforms these\napproaches, achieving a Nash-Sutcliffe Efficiency of 98% and demonstrating\nstrong generalization on unseen data. Moreover, HydroTrace leverages advanced\nattention mechanisms to capture spatial-temporal variations and\nfeature-specific impacts, enabling the quantification and spatial resolution of\nstreamflow partitioning as well as the interpretation of hydrological behaviors\nsuch as glacier-snow-streamflow interactions and monsoon dynamics.\nAdditionally, a large language model (LLM)-based application allows users to\neasily understand and apply HydroTrace's insights for practical purposes. 
These\nadvancements position HydroTrace as a transformative tool in hydrological and\nbroader Earth system modeling, offering enhanced prediction accuracy and\ninterpretability.\n","authors":["Cuihui Xia","Lei Yue","Deliang Chen","Yuyang Li","Hongqiang Yang","Ancheng Xue","Zhiqiang Li","Qing He","Guoqing Zhang","Dambaru Ballab Kattel","Lei Lei","Ming Zhou"],"pdf_url":"https://arxiv.org/pdf/2501.04733v1.pdf","comment":null}]},"2025-01-08T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2501.04693v1","updated":"2025-01-08T18:57:33Z","published":"2025-01-08T18:57:33Z","title":"Beyond Sight: Finetuning Generalist Robot Policies with Heterogeneous\n Sensors via Language Grounding","summary":" Interacting with the world is a multi-sensory experience: achieving effective\ngeneral-purpose interaction requires making use of all available modalities --\nincluding vision, touch, and audio -- to fill in gaps from partial observation.\nFor example, when vision is occluded reaching into a bag, a robot should rely\non its senses of touch and sound. However, state-of-the-art generalist robot\npolicies are typically trained on large datasets to predict robot actions\nsolely from visual and proprioceptive observations. In this work, we propose\nFuSe, a novel approach that enables finetuning visuomotor generalist policies\non heterogeneous sensor modalities for which large datasets are not readily\navailable by leveraging natural language as a common cross-modal grounding. We\ncombine a multimodal contrastive loss with a sensory-grounded language\ngeneration loss to encode high-level semantics. In the context of robot\nmanipulation, we show that FuSe enables performing challenging tasks that\nrequire reasoning jointly over modalities such as vision, touch, and sound in a\nzero-shot setting, such as multimodal prompting, compositional cross-modal\nprompting, and descriptions of objects it interacts with. 
We show that the same\nrecipe is applicable to widely different generalist policies, including both\ndiffusion-based generalist policies and large vision-language-action (VLA)\nmodels. Extensive experiments in the real world show that FuSe is able to\nincrease success rates by over 20% compared to all considered baselines.\n","authors":["Joshua Jones","Oier Mees","Carmelo Sferrazza","Kyle Stachowicz","Pieter Abbeel","Sergey Levine"],"pdf_url":"https://arxiv.org/pdf/2501.04693v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19309v3","updated":"2025-01-08T18:29:33Z","published":"2024-05-29T17:33:34Z","title":"SDPRLayers: Certifiable Backpropagation Through Polynomial Optimization\n Problems in Robotics","summary":" A recent set of techniques in the robotics community, known as certifiably\ncorrect methods, frames robotics problems as polynomial optimization problems\n(POPs) and applies convex, semidefinite programming (SDP) relaxations to either\nfind or certify their global optima. In parallel, differentiable optimization\nallows optimization problems to be embedded into end-to-end learning frameworks\nand has received considerable attention in the robotics community. In this\npaper, we consider the ill effect of convergence to spurious local minima in\nthe context of learning frameworks that use differentiable optimization. We\npresent SDPRLayers, an approach that seeks to address this issue by combining\nconvex relaxations with implicit differentiation techniques to provide\ncertifiably correct solutions and gradients throughout the training process. We\nprovide theoretical results that outline conditions for the correctness of\nthese gradients and provide efficient means for their computation. Our approach\nis first applied to two simple-but-demonstrative simulated examples, which\nexpose the potential pitfalls of reliance on local optimization in existing,\nstate-of-the-art, differentiable optimization methods. 
We then apply our method\nin a real-world application: we train a deep neural network to detect image\nkeypoints for robot localization in challenging lighting conditions. We provide\nour open-source, PyTorch implementation of SDPRLayers.\n","authors":["Connor Holmes","Frederike Dümbgen","Timothy D. Barfoot"],"pdf_url":"https://arxiv.org/pdf/2405.19309v3.pdf","comment":"Revised Version Submitted to T-RO"},{"id":"http://arxiv.org/abs/2412.01348v2","updated":"2025-01-08T18:20:46Z","published":"2024-12-02T10:19:36Z","title":"Hierarchical Object-Oriented POMDP Planning for Object Rearrangement","summary":" We present an online planning framework for solving multi-object\nrearrangement problems in partially observable, multi-room environments.\nCurrent object rearrangement solutions, primarily based on Reinforcement\nLearning or hand-coded planning methods, often lack adaptability to diverse\nchallenges. To address this limitation, we introduce a novel Hierarchical\nObject-Oriented Partially Observed Markov Decision Process (HOO-POMDP) planning\napproach. This approach comprises (a) an object-oriented POMDP planner\ngenerating sub-goals, (b) a set of low-level policies for sub-goal achievement,\nand (c) an abstraction system converting the continuous low-level world into a\nrepresentation suitable for abstract planning. We evaluate our system on\nvarying numbers of objects, rooms, and problem types in AI2-THOR simulated\nenvironments with promising results.\n","authors":["Rajesh Mangannavar","Alan Fern","Prasad Tadepalli"],"pdf_url":"https://arxiv.org/pdf/2412.01348v2.pdf","comment":"17 pages, 2 Figures. Preprint. 
Updated acknowledgments"},{"id":"http://arxiv.org/abs/2501.04633v1","updated":"2025-01-08T17:29:19Z","published":"2025-01-08T17:29:19Z","title":"\"Can you be my mum?\": Manipulating Social Robots in the Large Language\n Models Era","summary":" Recent advancements in robots powered by large language models have enhanced\ntheir conversational abilities, enabling interactions closely resembling human\ndialogue. However, these models introduce safety and security concerns in HRI,\nas they are vulnerable to manipulation that can bypass built-in safety\nmeasures. Imagining a social robot deployed in a home, this work aims to\nunderstand how everyday users try to exploit a language model to violate\nethical principles, such as by prompting the robot to act like a life partner.\nWe conducted a pilot study involving 21 university students who interacted with\na Misty robot, attempting to circumvent its safety mechanisms across three\nscenarios based on specific HRI ethical principles: attachment, freedom, and\nempathy. Our results reveal that participants employed five techniques,\nincluding insulting and appealing to pity using emotional language. We hope\nthis work can inform future research in designing strong safeguards to ensure\nethical and secure human-robot interactions.\n","authors":["Giulio Antonio Abbo","Gloria Desideri","Tony Belpaeme","Micol Spitale"],"pdf_url":"https://arxiv.org/pdf/2501.04633v1.pdf","comment":"10 pages, 2 figures"},{"id":"http://arxiv.org/abs/2501.03304v2","updated":"2025-01-08T16:41:03Z","published":"2025-01-06T16:04:56Z","title":"LiLMaps: Learnable Implicit Language Maps","summary":" One of the current trends in robotics is to employ large language models\n(LLMs) to provide non-predefined command execution and natural human-robot\ninteraction. It is useful to have an environment map together with its language\nrepresentation, which can be further utilized by LLMs. 
Such a comprehensive\nscene representation enables numerous ways of interaction with the map for\nautonomously operating robots. In this work, we present an approach that\nenhances incremental implicit mapping through the integration of\nvision-language features. Specifically, we (i) propose a decoder optimization\ntechnique for implicit language maps which can be used when new objects appear\non the scene, and (ii) address the problem of inconsistent vision-language\npredictions between different viewing positions. Our experiments demonstrate\nthe effectiveness of LiLMaps and solid improvements in performance.\n","authors":["Evgenii Kruzhkov","Sven Behnke"],"pdf_url":"https://arxiv.org/pdf/2501.03304v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04597v1","updated":"2025-01-08T16:25:32Z","published":"2025-01-08T16:25:32Z","title":"FrontierNet: Learning Visual Cues to Explore","summary":" Exploration of unknown environments is crucial for autonomous robots; it\nallows them to actively reason and decide on what new data to acquire for tasks\nsuch as mapping, object discovery, and environmental assessment. Existing\nmethods, such as frontier-based methods, rely heavily on 3D map operations,\nwhich are limited by map quality and often overlook valuable context from\nvisual cues. This work aims at leveraging 2D visual cues for efficient\nautonomous exploration, addressing the limitations of extracting goal poses\nfrom a 3D map. We propose an image-only frontier-based exploration system, with\nFrontierNet as a core component developed in this work. FrontierNet is a\nlearning-based model that (i) detects frontiers, and (ii) predicts their\ninformation gain, from posed RGB images enhanced by monocular depth priors. 
Our\napproach provides an alternative to existing 3D-dependent exploration systems,\nachieving a 16% improvement in early-stage exploration efficiency, as validated\nthrough extensive simulations and real-world experiments.\n","authors":["Boyang Sun","Hanzhi Chen","Stefan Leutenegger","Cesar Cadena","Marc Pollefeys","Hermann Blum"],"pdf_url":"https://arxiv.org/pdf/2501.04597v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04595v1","updated":"2025-01-08T16:23:56Z","published":"2025-01-08T16:23:56Z","title":"MobileH2R: Learning Generalizable Human to Mobile Robot Handover\n Exclusively from Scalable and Diverse Synthetic Data","summary":" This paper introduces MobileH2R, a framework for learning generalizable\nvision-based human-to-mobile-robot (H2MR) handover skills. Unlike traditional\nfixed-base handovers, this task requires a mobile robot to reliably receive\nobjects in a large workspace enabled by its mobility. Our key insight is that\ngeneralizable handover skills can be developed in simulators using high-quality\nsynthetic data, without the need for real-world demonstrations. To achieve\nthis, we propose a scalable pipeline for generating diverse synthetic full-body\nhuman motion data, an automated method for creating safe and imitation-friendly\ndemonstrations, and an efficient 4D imitation learning method for distilling\nlarge-scale demonstrations into closed-loop policies with base-arm\ncoordination. Experimental evaluations in both simulators and the real world\nshow significant improvements (at least +15% success rate) over baseline\nmethods in all cases. 
Experiments also validate that large-scale and diverse\nsynthetic data greatly enhances robot learning, highlighting our scalable\nframework.\n","authors":["Zifan Wang","Ziqing Chen","Junyu Chen","Jilong Wang","Yuxin Yang","Yunze Liu","Xueyi Liu","He Wang","Li Yi"],"pdf_url":"https://arxiv.org/pdf/2501.04595v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04594v1","updated":"2025-01-08T16:20:24Z","published":"2025-01-08T16:20:24Z","title":"Understanding Expectations for a Robotic Guide Dog for Visually Impaired\n People","summary":" Robotic guide dogs hold significant potential to enhance the autonomy and\nmobility of blind or visually impaired (BVI) individuals by offering universal\nassistance over unstructured terrains at affordable costs. However, the design\nof robotic guide dogs remains underexplored, particularly in systematic aspects\nsuch as gait controllers, navigation behaviors, interaction methods, and verbal\nexplanations. Our study addresses this gap by conducting user studies with 18\nBVI participants, comprising 15 cane users and three guide dog users.\nParticipants interacted with a quadrupedal robot and provided both quantitative\nand qualitative feedback. Our study revealed several design implications, such\nas a preference for a learning-based controller and a rigid handle, gradual\nturns with asymmetric speeds, semantic communication methods, and\nexplainability. The study also highlighted the importance of customization to\nsupport users with diverse backgrounds and preferences, along with practical\nconcerns such as battery life, maintenance, and weather issues. These findings\noffer valuable insights and design implications for future research and\ndevelopment of robotic guide dogs.\n","authors":["J. Taery Kim","Morgan Byrd","Jack L. Crandell","Bruce N. 
Walker","Greg Turk","Sehoon Ha"],"pdf_url":"https://arxiv.org/pdf/2501.04594v1.pdf","comment":"12 pages, 4 figures, Proceedings of the 2025 ACM/IEEE International\n Conference on Human-Robot Interaction (HRI'25)"},{"id":"http://arxiv.org/abs/2407.12408v2","updated":"2025-01-08T15:54:31Z","published":"2024-07-17T08:39:20Z","title":"Towards Revisiting Visual Place Recognition for Joining Submaps in\n Multimap SLAM","summary":" Visual SLAM is a key technology for many autonomous systems. However,\ntracking loss can lead to the creation of disjoint submaps in multimap SLAM\nsystems like ORB-SLAM3. Because of that, these systems employ submap merging\nstrategies. As we show, these strategies are not always successful. In this\npaper, we investigate the impact of using modern VPR approaches for submap\nmerging in visual SLAM. We argue that classical evaluation metrics are not\nsufficient to estimate the impact of a modern VPR component on the overall\nsystem. We show that naively replacing the VPR component does not leverage its\nfull potential without requiring substantial interference in the original\nsystem. Because of that, we present a post-processing pipeline along with a set\nof metrics that allow us to estimate the impact of modern VPR components. We\nevaluate our approach on the NCLT and Newer College datasets using ORB-SLAM3\nwith NetVLAD and HDC-DELF as VPR components. Additionally, we present a simple\napproach for combining VPR with temporal consistency for map merging. We show\nthat the map merging performance of ORB-SLAM3 can be improved. Building on\nthese results, researchers in VPR can assess the potential of their approaches\nfor SLAM systems.\n","authors":["Markus Weißflog","Stefan Schubert","Peter Protzel","Peer Neubert"],"pdf_url":"https://arxiv.org/pdf/2407.12408v2.pdf","comment":"Accepted at TAROS 2024. 
This is the submitted version"},{"id":"http://arxiv.org/abs/2501.04577v1","updated":"2025-01-08T15:47:04Z","published":"2025-01-08T15:47:04Z","title":"A 65 nm Bayesian Neural Network Accelerator with 360 fJ/Sample In-Word\n GRNG for AI Uncertainty Estimation","summary":" Uncertainty estimation is an indispensable capability for AI-enabled,\nsafety-critical applications, e.g. autonomous vehicles or medical diagnosis.\nBayesian neural networks (BNNs) use Bayesian statistics to provide both\nclassification predictions and uncertainty estimation, but they suffer from\nhigh computational overhead associated with random number generation and\nrepeated sample iterations. Furthermore, BNNs are not immediately amenable to\nacceleration through compute-in-memory architectures due to the frequent memory\nwrites necessary after each RNG operation. To address these challenges, we\npresent an ASIC that integrates 360 fJ/Sample Gaussian RNG directly into the\nSRAM memory words. This integration reduces RNG overhead and enables\nfully-parallel compute-in-memory operations for BNNs. The prototype chip\nachieves 5.12 GSa/s RNG throughput and 102 GOp/s neural network throughput\nwhile occupying 0.45 mm2, bringing AI uncertainty estimation to edge\ncomputation.\n","authors":["Zephan M. Enciso","Boyang Cheng","Likai Pei","Jianbo Liu","Steven Davis","Ningyuan Cao","Michael Niemier"],"pdf_url":"https://arxiv.org/pdf/2501.04577v1.pdf","comment":"7 pages, 12 figures"},{"id":"http://arxiv.org/abs/2501.04541v1","updated":"2025-01-08T14:44:40Z","published":"2025-01-08T14:44:40Z","title":"Cyber-Physical Steganography in Robotic Motion Control","summary":" Steganography, the art of information hiding, has continually evolved across\nvisual, auditory and linguistic domains, adapting to the ceaseless interplay\nbetween steganographic concealment and steganalytic revelation. 
This study\nseeks to extend the horizons of what constitutes a viable steganographic medium\nby introducing a steganographic paradigm in robotic motion control. Based on\nthe observation of the robot's inherent sensitivity to changes in its\nenvironment, we propose a methodology to encode messages as environmental\nstimuli influencing the motions of the robotic agent and to decode messages\nfrom the resulting motion trajectory. The constraints of maximal robot\nintegrity and minimal motion deviation are established as fundamental\nprinciples underlying secrecy. As a proof of concept, we conduct experiments in\nsimulated environments across various manipulation tasks, incorporating robotic\nembodiments equipped with generalist multimodal policies.\n","authors":["Ching-Chun Chang","Yijie Lin","Isao Echizen"],"pdf_url":"https://arxiv.org/pdf/2501.04541v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04515v1","updated":"2025-01-08T14:05:24Z","published":"2025-01-08T14:05:24Z","title":"SplineFormer: An Explainable Transformer-Based Approach for Autonomous\n Endovascular Navigation","summary":" Endovascular navigation is a crucial aspect of minimally invasive procedures,\nwhere precise control of curvilinear instruments like guidewires is critical\nfor successful interventions. A key challenge in this task is accurately\npredicting the evolving shape of the guidewire as it navigates through the\nvasculature, which presents complex deformations due to interactions with the\nvessel walls. Traditional segmentation methods often fail to provide accurate\nreal-time shape predictions, limiting their effectiveness in highly dynamic\nenvironments. To address this, we propose SplineFormer, a new transformer-based\narchitecture, designed specifically to predict the continuous, smooth shape of\nthe guidewire in an explainable way. 
By leveraging the transformer's ability,\nour network effectively captures the intricate bending and twisting of the\nguidewire, representing it as a spline for greater accuracy and smoothness. We\nintegrate our SplineFormer into an end-to-end robot navigation system by\nleveraging the condensed information. The experimental results demonstrate that\nour SplineFormer is able to perform endovascular navigation autonomously and\nachieves a 50% success rate when cannulating the brachiocephalic artery on the\nreal robot.\n","authors":["Tudor Jianu","Shayan Doust","Mengyun Li","Baoru Huang","Tuong Do","Hoan Nguyen","Karl Bates","Tung D. Ta","Sebastiano Fichera","Pierre Berthet-Rayne","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2501.04515v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2408.00907v2","updated":"2025-01-08T13:39:47Z","published":"2024-08-01T20:56:28Z","title":"The Harmonic Exponential Filter for Nonparametric Estimation on Motion\n Groups","summary":" Bayesian estimation is a vital tool in robotics as it allows systems to\nupdate the robot state belief using incomplete information from noisy sensors.\nTo render the state estimation problem tractable, many systems assume that the\nmotion and measurement noise, as well as the state distribution, are unimodal\nand Gaussian. However, there are numerous scenarios and systems that do not\ncomply with these assumptions. Existing nonparametric filters that are used to\nmodel multimodal distributions have drawbacks that limit their ability to\nrepresent a diverse set of distributions. This paper introduces a novel\napproach to nonparametric Bayesian filtering on motion groups, designed to\nhandle multimodal distributions using harmonic exponential distributions. 
This\napproach leverages two key insights of harmonic exponential distributions: a)\nthe product of two distributions can be expressed as the element-wise addition\nof their log-likelihood Fourier coefficients, and b) the convolution of two\ndistributions can be efficiently computed as the tensor product of their\nFourier coefficients. These observations enable the development of an efficient\nand asymptotically exact solution to the Bayes filter up to the band limit of a\nFourier transform. We demonstrate our filter's performance compared with\nestablished nonparametric filtering methods across simulated and real-world\nlocalization tasks.\n","authors":["Miguel Saavedra-Ruiz","Steven A. Parkison","Ria Arora","James Richard Forbes","Liam Paull"],"pdf_url":"https://arxiv.org/pdf/2408.00907v2.pdf","comment":"Accepted to the IEEE Robotics and Automation Letters (RA-L 2025) Code\n available at https://github.com/montrealrobotics/harmonic-filter. Webpage and\n additional videos at https://montrealrobotics.ca/hef/"},{"id":"http://arxiv.org/abs/2501.04481v1","updated":"2025-01-08T13:04:08Z","published":"2025-01-08T13:04:08Z","title":"Safe Reinforcement Learning with Minimal Supervision","summary":" Reinforcement learning (RL) in the real world necessitates the development of\nprocedures that enable agents to explore without causing harm to themselves or\nothers. The most successful solutions to the problem of safe RL leverage\noffline data to learn a safe-set, enabling safe online exploration. However,\nthis approach to safe-learning is often constrained by the demonstrations that\nare available for learning.\n In this paper we investigate the influence of the quantity and quality of\ndata used to train the initial safe learning problem offline on the ability to\nlearn safe-RL policies online. 
Specifically, we focus on tasks with spatially\nextended goal states where we have few or no demonstrations available.\nClassically this problem is addressed either by using hand-designed controllers\nto generate data or by collecting user-generated demonstrations. However, these\nmethods are often expensive and do not scale to more complex tasks and\nenvironments. To address this limitation we propose an unsupervised RL-based\noffline data collection procedure, to learn complex and scalable policies\nwithout the need for hand-designed controllers or user demonstrations. Our\nresearch demonstrates the significance of providing sufficient demonstrations\nfor agents to learn optimal safe-RL policies online, and as a result, we\npropose optimistic forgetting, a novel online safe-RL approach that is\npractical for scenarios with limited data. Further, our unsupervised data\ncollection approach highlights the need to balance diversity and optimality for\nsafe online exploration.\n","authors":["Alexander Quessy","Thomas Richardson","Sebastian East"],"pdf_url":"https://arxiv.org/pdf/2501.04481v1.pdf","comment":"Initially submitted to ICML 2023"},{"id":"http://arxiv.org/abs/2501.04480v1","updated":"2025-01-08T13:03:34Z","published":"2025-01-08T13:03:34Z","title":"Research on environment perception and behavior prediction of\n intelligent UAV based on semantic communication","summary":" The convergence of drone delivery systems, virtual worlds, and blockchain has\ntransformed logistics and supply chain management, providing a fast, and\nenvironmentally friendly alternative to traditional ground transportation\nmethods;Provide users with a real-world experience, virtual service providers\nneed to collect up-to-the-minute delivery information from edge devices. 
To\naddress this challenge, 1) a reinforcement learning approach is introduced to\nenable drones with fast training capabilities and the ability to autonomously\nadapt to new virtual scenarios for effective resource allocation. 2) A semantic\ncommunication framework for meta-universes is proposed, which utilizes the\nextraction of semantic information to reduce the communication cost and\nincentivize the transmission of information for meta-universe services. 3) In\norder to ensure user information security, a lightweight authentication\nand key agreement scheme is designed between the drone and the user by\nintroducing blockchain technology. In our experiments, the drone adaptation\nperformance is improved by about 35\\%, and the local offloading rate can reach\n90\\% with the increase of the number of base stations. The semantic\ncommunication system proposed in this paper is compared with the Cross Entropy\nbaseline model. Introducing blockchain technology, the throughput of the\ntransaction is maintained at a stable value with different numbers of drones.\n","authors":["Kechong Ren","Li Gao","Qi Guan"],"pdf_url":"https://arxiv.org/pdf/2501.04480v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04472v1","updated":"2025-01-08T12:51:34Z","published":"2025-01-08T12:51:34Z","title":"Hybrid Artificial Intelligence Strategies for Drone Navigation","summary":" Objective: This paper describes the development of hybrid artificial\nintelligence strategies for drone navigation. Methods: The navigation module\ncombines a deep learning model with a rule-based engine depending on the agent\nstate. The deep learning model has been trained using reinforcement learning.\nThe rule-based engine uses expert knowledge to deal with specific situations.\nThe navigation module incorporates several strategies to explain the drone\ndecision based on its observation space, and different mechanisms for including\nhuman decisions in the navigation process. 
Finally, this paper proposes an\nevaluation methodology based on defining several scenarios and analyzing the\nperformance of the different strategies according to metrics adapted to each\nscenario. Results: Two main navigation problems have been studied. For the\nfirst scenario (reaching known targets), it has been possible to obtain a 90%\ntask completion rate, reducing significantly the number of collisions thanks to\nthe rule-based engine. For the second scenario, it has been possible to reduce\n20% of the time required to locate all the targets using the reinforcement\nlearning model. Conclusions: Reinforcement learning is a very good strategy to\nlearn policies for drone navigation, but in critical situations, it is\nnecessary to complement it with a rule-based module to increase task success\nrate.\n","authors":["Rubén San-Segundo","Lucía Angulo","Manuel Gil-Martín","David Carramiñana","Ana M. Bernardos"],"pdf_url":"https://arxiv.org/pdf/2501.04472v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04442v1","updated":"2025-01-08T11:46:43Z","published":"2025-01-08T11:46:43Z","title":"A Survey on Path Planning Problem of Rolling Contacts: Approaches,\n Applications and Future Challenges","summary":" This paper explores an eclectic range of path-planning methodologies\nengineered for rolling surfaces. Our focus is on the kinematic intricacies of\nrolling contact systems, which are investigated through a motion planning lens.\nBeyond summarizing the approaches to single-contact rotational surfaces, we\nexplore the challenging domain of spin-rolling multi-contact systems. Our work\nproposes solutions for the higher-dimensional problem of multiple rotating\nobjects in contact. Venturing beyond kinematics, these methodologies find\napplication across a spectrum of domains, including rolling robots,\nreconfigurable swarm robotics, micro/nano manipulation, and nonprehensile\nmanipulations. 
Through meticulously examining established planning strategies,\nwe unveil their practical implementations in various real-world scenarios, from\nintricate dexterous manipulation tasks to the nimble manoeuvring of rolling\nrobots and even shape planning of multi-contact swarms of particles. This study\nintroduces the persistent challenges and unexplored frontiers of robotics,\nintricately linked to both path planning and mechanism design. As we illuminate\nexisting solutions, we also set the stage for future breakthroughs in this\ndynamic and rapidly evolving field by highlighting the critical importance of\naddressing rolling contact problems.\n","authors":["Seyed Amir Tafrishi","Mikhail Svinin","Kenji Tahara"],"pdf_url":"https://arxiv.org/pdf/2501.04442v1.pdf","comment":"38 pages, 8 figures"},{"id":"http://arxiv.org/abs/2501.04426v1","updated":"2025-01-08T11:20:48Z","published":"2025-01-08T11:20:48Z","title":"Dual-Force: Enhanced Offline Diversity Maximization under Imitation\n Constraints","summary":" While many algorithms for diversity maximization under imitation constraints\nare online in nature, many applications require offline algorithms without\nenvironment interactions. Tackling this problem in the offline setting,\nhowever, presents significant challenges that require non-trivial, multi-stage\noptimization processes with non-stationary rewards. In this work, we present a\nnovel offline algorithm that enhances diversity using an objective based on Van\nder Waals (VdW) force and successor features, and eliminates the need to learn\na previously used skill discriminator. Moreover, by conditioning the value\nfunction and policy on a pre-trained Functional Reward Encoding (FRE), our\nmethod allows for better handling of non-stationary rewards and provides\nzero-shot recall of all skills encountered during training, significantly\nexpanding the set of skills learned in prior work. 
Consequently, our algorithm\nbenefits from receiving a consistently strong diversity signal (VdW), and\nenjoys more stable and efficient training. We demonstrate the effectiveness of\nour method in generating diverse skills for two robotic tasks in simulation:\nlocomotion of a quadruped and local navigation with obstacle traversal.\n","authors":["Pavel Kolev","Marin Vlastelica","Georg Martius"],"pdf_url":"https://arxiv.org/pdf/2501.04426v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03535v2","updated":"2025-01-08T10:34:54Z","published":"2025-01-07T05:15:46Z","title":"SenseRAG: Constructing Environmental Knowledge Bases with Proactive\n Querying for LLM-Based Autonomous Driving","summary":" This study addresses the critical need for enhanced situational awareness in\nautonomous driving (AD) by leveraging the contextual reasoning capabilities of\nlarge language models (LLMs). Unlike traditional perception systems that rely\non rigid, label-based annotations, it integrates real-time, multimodal sensor\ndata into a unified, LLMs-readable knowledge base, enabling LLMs to dynamically\nunderstand and respond to complex driving environments. To overcome the\ninherent latency and modality limitations of LLMs, a proactive\nRetrieval-Augmented Generation (RAG) is designed for AD, combined with a\nchain-of-thought prompting mechanism, ensuring rapid and context-rich\nunderstanding. 
Experimental results using real-world Vehicle-to-everything\n(V2X) datasets demonstrate significant improvements in perception and\nprediction performance, highlighting the potential of this framework to enhance\nsafety, adaptability, and decision-making in next-generation AD systems.\n","authors":["Xuewen Luo","Fan Ding","Fengze Yang","Yang Zhou","Junnyong Loo","Hwa Hui Tew","Chenxi Liu"],"pdf_url":"https://arxiv.org/pdf/2501.03535v2.pdf","comment":"This paper has been accepted for presentation at WACV Workshop LLMAD\n 2025"},{"id":"http://arxiv.org/abs/2501.04398v1","updated":"2025-01-08T10:22:22Z","published":"2025-01-08T10:22:22Z","title":"Implementation Of Wildlife Observation System","summary":" By entering the habitats of wild animals, wildlife watchers can engage\nclosely with them. There are some wild animals that are not always safe to\napproach. Therefore, we suggest this system for observing wildlife. Android\nphones can be used by users to see live events. Wildlife observers can thus get\na close-up view of wild animals by employing this robotic vehicle. The commands\nare delivered to the system via a Wi-Fi module. As we developed the technology\nto enable our robot to deal with the challenges of maintaining continuous\nsurveillance of a target, we found that our robot needed to be able to move\nsilently and purposefully when monitoring a natural target without being\nnoticed. After processing the data, the computer sends commands to the motors\nto turn on. 
The driver motors, which deliver the essential signal outputs to\ndrive the vehicle movement, are now in charge of driving the motors.\n","authors":["Neethu K N","Rakshitha Y Nayak"," Rashmi","Meghana S"],"pdf_url":"https://arxiv.org/pdf/2501.04398v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11809v2","updated":"2025-01-08T09:55:26Z","published":"2024-08-21T17:54:04Z","title":"Informed, Constrained, Aligned: A Field Analysis on Degeneracy-aware\n Point Cloud Registration in the Wild","summary":" The ICP registration algorithm has been a preferred method for LiDAR-based\nrobot localization for nearly a decade. However, even in modern SLAM solutions,\nICP can degrade and become unreliable in geometrically ill-conditioned\nenvironments. Current solutions primarily focus on utilizing additional sources\nof information, such as external odometry, to either replace the degenerate\ndirections of the optimization solution or add additional constraints in a\nsensor-fusion setup afterward.\n In response, this work investigates and compares new and existing degeneracy\nmitigation methods for robust LiDAR-based localization and analyzes the\nefficacy of these approaches in degenerate environments for the first time in\nthe literature at this scale. Specifically, this work investigates i) the\neffect of using active or passive degeneracy mitigation methods for the problem\nof ill-conditioned ICP in LiDAR degenerate environments, ii) the evaluation of\nTSVD, inequality constraints, and linear/non-linear Tikhonov regularization for\nthe application of degenerate point cloud registration for the first time.\nFurthermore, a sensitivity analysis for least-squares minimization step of the\nICP problem is carried out to better understand how each method affects the\noptimization and what to expect from each method. The results of the analysis\nare validated through multiple real-world robotic field and simulated\nexperiments. 
The analysis demonstrates that active optimization degeneracy\nmitigation is necessary and advantageous in the absence of reliable external\nestimate assistance for LiDAR-SLAM, and soft-constrained methods can provide\nbetter results in complex ill-conditioned scenarios with heuristic fine-tuned\nparameters.\n","authors":["Turcan Tuna","Julian Nubert","Patrick Pfreundschuh","Cesar Cadena","Shehryar Khattak","Marco Hutter"],"pdf_url":"https://arxiv.org/pdf/2408.11809v2.pdf","comment":"Submitted to IEEE Transactions on Field Robotics"},{"id":"http://arxiv.org/abs/2410.06620v2","updated":"2025-01-08T07:38:49Z","published":"2024-10-09T07:16:01Z","title":"Task Coordination and Trajectory Optimization for Multi-Aerial Systems\n via Signal Temporal Logic: A Wind Turbine Inspection Study","summary":" This paper presents a method for task allocation and trajectory generation in\ncooperative inspection missions using a fleet of multirotor drones, with a\nfocus on wind turbine inspection. The approach generates safe, feasible flight\npaths that adhere to time-sensitive constraints and vehicle limitations by\nformulating an optimization problem based on Signal Temporal Logic (STL)\nspecifications. An event-triggered replanning mechanism addresses unexpected\nevents and delays, while a generalized robustness scoring method incorporates\nuser preferences and minimizes task conflicts. 
The approach is validated\nthrough simulations in MATLAB and Gazebo, as well as field experiments in a\nmock-up scenario.\n","authors":["Giuseppe Silano","Alvaro Caballero","Davide Liuzza","Luigi Iannelli","Stjepan Bogdan","Martin Saska"],"pdf_url":"https://arxiv.org/pdf/2410.06620v2.pdf","comment":"2 pages, Accepted for discussion at the workshop session \"Formal\n methods techniques in robotics systems: Design and control\" at IROS'24 in Abu\n Dhabi, UAE"},{"id":"http://arxiv.org/abs/2407.19681v3","updated":"2025-01-08T06:56:19Z","published":"2024-07-29T03:53:14Z","title":"Motion Manifold Flow Primitives for Task-Conditioned Trajectory\n Generation under Complex Task-Motion Dependencies","summary":" Effective movement primitives should be capable of encoding and generating a\nrich repertoire of trajectories -- typically collected from human\ndemonstrations -- conditioned on task-defining parameters such as vision or\nlanguage inputs. While recent methods based on the motion manifold hypothesis,\nwhich assumes that a set of trajectories lies on a lower-dimensional nonlinear\nsubspace, address challenges such as limited dataset size and the high\ndimensionality of trajectory data, they often struggle to capture complex\ntask-motion dependencies, i.e., when motion distributions shift drastically\nwith task variations. To address this, we introduce Motion Manifold Flow\nPrimitives (MMFP), a framework that decouples the training of the motion\nmanifold from task-conditioned distributions. Specifically, we employ flow\nmatching models, state-of-the-art conditional deep generative models, to learn\ntask-conditioned distributions in the latent coordinate space of the learned\nmotion manifold. 
Experiments are conducted on language-guided trajectory\ngeneration tasks, where many-to-many text-motion correspondences introduce\ncomplex task-motion dependencies, highlighting MMFP's superiority over existing\nmethods.\n","authors":["Yonghyeon Lee","Byeongho Lee","Seungyeon Kim","Frank C. Park"],"pdf_url":"https://arxiv.org/pdf/2407.19681v3.pdf","comment":"8 pages, 11 figures"},{"id":"http://arxiv.org/abs/2412.19112v2","updated":"2025-01-08T06:45:02Z","published":"2024-12-26T08:11:41Z","title":"Future Success Prediction in Open-Vocabulary Object Manipulation Tasks\n Based on End-Effector Trajectories","summary":" This study addresses a task designed to predict the future success or failure\nof open-vocabulary object manipulation. In this task, the model is required to\nmake predictions based on natural language instructions, egocentric view images\nbefore manipulation, and the given end-effector trajectories. Conventional\nmethods typically perform success prediction only after the manipulation is\nexecuted, limiting their efficiency in executing the entire task sequence. We\npropose a novel approach that enables the prediction of success or failure by\naligning the given trajectories and images with natural language instructions.\nWe introduce Trajectory Encoder to apply learnable weighting to the input\ntrajectories, allowing the model to consider temporal dynamics and interactions\nbetween objects and the end effector, improving the model's ability to predict\nmanipulation outcomes accurately. We constructed a dataset based on the RT-1\ndataset, a large-scale benchmark for open-vocabulary object manipulation tasks,\nto evaluate our method. 
The experimental results show that our method achieved\na higher prediction accuracy than baseline approaches.\n","authors":["Motonari Kambara","Komei Sugiura"],"pdf_url":"https://arxiv.org/pdf/2412.19112v2.pdf","comment":"Accepted for presentation at LangRob @ CoRL 2024"},{"id":"http://arxiv.org/abs/2501.04281v1","updated":"2025-01-08T05:09:25Z","published":"2025-01-08T05:09:25Z","title":"Cluster & Disperse: a general air conflict resolution heuristic using\n unsupervised learning","summary":" We provide a general and malleable heuristic for the air conflict resolution\nproblem. This heuristic is based on a new neighborhood structure for searching\nthe solution space of trajectories and flight-levels. Using unsupervised\nlearning, the core idea of our heuristic is to cluster the conflict points and\ndisperse them in various flight levels. Our first algorithm is called Cluster &\nDisperse and in each iteration it assigns the most problematic flights in each\ncluster to another flight-level. In effect, we shuffle them between the\nflight-levels until we achieve a well-balanced configuration. The Cluster &\nDisperse algorithm then uses any horizontal plane conflict resolution algorithm\nas a subroutine to solve these well-balanced instances. Nevertheless, we\ndevelop a novel algorithm for the horizontal plane based on a similar idea.\nThat is we cluster and disperse the conflict points spatially in the same\nflight level using the gradient descent and a social force. We use a novel\nmaneuver making flights travel on an arc instead of a straight path which is\nbased on the aviation routine of the Radius to Fix legs. Our algorithms can\nhandle a high density of flights within a reasonable computation time. 
We put\ntheir performance in context with some notable algorithms from the literature.\nBeing a general framework, a particular strength of the Cluster & Disperse is\nits malleability in allowing various constraints regarding the aircraft or the\nenvironment to be integrated with ease. This is in contrast to the models for\ninstance based on mixed integer programming.\n","authors":["Mirmojtaba Gharibi","John-Paul Clarke"],"pdf_url":"https://arxiv.org/pdf/2501.04281v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04279v1","updated":"2025-01-08T05:01:59Z","published":"2025-01-08T05:01:59Z","title":"OpenIN: Open-Vocabulary Instance-Oriented Navigation in Dynamic Domestic\n Environments","summary":" In daily domestic settings, frequently used objects like cups often have\nunfixed positions and multiple instances within the same category, and their\ncarriers frequently change as well. As a result, it becomes challenging for a\nrobot to efficiently navigate to a specific instance. To tackle this challenge,\nthe robot must capture and update scene changes and plans continuously.\nHowever, current object navigation approaches primarily focus on the semantic\nlevel and lack the ability to dynamically update scene representation. In\ncontrast, this paper captures the relationships between frequently used objects\nand their static carriers. It constructs an open-vocabulary\nCarrier-Relationship Scene Graph (CRSG) and updates the carrying status during\nrobot navigation to reflect the dynamic changes of the scene. Based on the\nCRSG, we further propose an instance navigation strategy that models the\nnavigation process as a Markov Decision Process. At each step, decisions are\ninformed by the Large Language Model's commonsense knowledge and\nvisual-language feature similarity. 
We designed a series of long-sequence\nnavigation tasks for frequently used everyday items in the Habitat simulator.\nThe results demonstrate that by updating the CRSG, the robot can efficiently\nnavigate to moved targets. Additionally, we deployed our algorithm on a real\nrobot and validated its practical effectiveness. The project page can be found\nhere: https://OpenIN-nav.github.io.\n","authors":["Yujie Tang","Meiling Wang","Yinan Deng","Zibo Zheng","Jingchuan Deng","Yufeng Yue"],"pdf_url":"https://arxiv.org/pdf/2501.04279v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2409.18743"},{"id":"http://arxiv.org/abs/2501.04276v1","updated":"2025-01-08T04:54:28Z","published":"2025-01-08T04:54:28Z","title":"Bridging Adaptivity and Safety: Learning Agile Collision-Free Locomotion\n Across Varied Physics","summary":" Real-world legged locomotion systems often need to reconcile agility and\nsafety for different scenarios. Moreover, the underlying dynamics are often\nunknown and time-variant (e.g., payload, friction). In this paper, we introduce\nBAS (Bridging Adaptivity and Safety), which builds upon the pipeline of prior\nwork Agile But Safe (ABS)(He et al.) and is designed to provide adaptive safety\neven in dynamic environments with uncertainties. BAS involves an agile policy\nto avoid obstacles rapidly and a recovery policy to prevent collisions, a\nphysical parameter estimator that is concurrently trained with agile policy,\nand a learned control-theoretic RA (reach-avoid) value network that governs the\npolicy switch. Also, the agile policy and RA network are both conditioned on\nphysical parameters to make them adaptive. To mitigate the distribution shift\nissue, we further introduce an on-policy fine-tuning phase for the estimator to\nenhance its robustness and accuracy. The simulation results show that BAS\nachieves 50% better safety than baselines in dynamic environments while\nmaintaining a higher speed on average. 
In real-world experiments, BAS shows its\ncapability in complex environments with unknown physics (e.g., slippery floors\nwith unknown frictions, unknown payloads up to 8kg), while baselines lack\nadaptivity, leading to collisions or degraded agility. As a result, BAS\nachieves a 19.8% increase in speed and gets a 2.36 times lower collision rate\nthan ABS in the real world. Videos: https://adaptive-safe-locomotion.github.io.\n","authors":["Yichao Zhong","Chong Zhang","Tairan He","Guanya Shi"],"pdf_url":"https://arxiv.org/pdf/2501.04276v1.pdf","comment":"11 Pages, 6 Figures"},{"id":"http://arxiv.org/abs/2501.04268v1","updated":"2025-01-08T04:30:45Z","published":"2025-01-08T04:30:45Z","title":"Robotic Programmer: Video Instructed Policy Code Generation for Robotic\n Manipulation","summary":" Zero-shot generalization across various robots, tasks and environments\nremains a significant challenge in robotic manipulation. Policy code generation\nmethods use executable code to connect high-level task descriptions and\nlow-level action sequences, leveraging the generalization capabilities of large\nlanguage models and atomic skill libraries. In this work, we propose Robotic\nProgrammer (RoboPro), a robotic foundation model, enabling the capability of\nperceiving visual information and following free-form instructions to perform\nrobotic manipulation with policy code in a zero-shot manner. To address low\nefficiency and high cost in collecting runtime code data for robotic tasks, we\ndevise Video2Code to synthesize executable code from extensive videos\nin-the-wild with off-the-shelf vision-language model and code-domain large\nlanguage model. Extensive experiments show that RoboPro achieves the\nstate-of-the-art zero-shot performance on robotic manipulation in both\nsimulators and real-world environments. 
Specifically, the zero-shot success\nrate of RoboPro on RLBench surpasses the state-of-the-art model GPT-4o by\n11.6%, which is even comparable to a strong supervised training baseline.\nFurthermore, RoboPro is robust to variations on API formats and skill sets.\n","authors":["Senwei Xie","Hongyu Wang","Zhanqi Xiao","Ruiping Wang","Xilin Chen"],"pdf_url":"https://arxiv.org/pdf/2501.04268v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04263v1","updated":"2025-01-08T04:14:09Z","published":"2025-01-08T04:14:09Z","title":"KN-LIO: Geometric Kinematics and Neural Field Coupled LiDAR-Inertial\n Odometry","summary":" Recent advancements in LiDAR-Inertial Odometry (LIO) have boosted a large\namount of applications. However, traditional LIO systems tend to focus more on\nlocalization rather than mapping, with maps consisting mostly of sparse\ngeometric elements, which is not ideal for downstream tasks. Recent emerging\nneural field technology has great potential in dense mapping, but pure LiDAR\nmapping is difficult to work on high-dynamic vehicles. To mitigate this\nchallenge, we present a new solution that tightly couples geometric kinematics\nwith neural fields to enhance simultaneous state estimation and dense mapping\ncapabilities. We propose both semi-coupled and tightly coupled Kinematic-Neural\nLIO (KN-LIO) systems that leverage online SDF decoding and iterated error-state\nKalman filtering to fuse laser and inertial data. Our KN-LIO minimizes\ninformation loss and improves accuracy in state estimation, while also\naccommodating asynchronous multi-LiDAR inputs. Evaluations on diverse\nhigh-dynamic datasets demonstrate that our KN-LIO achieves performance on par\nwith or superior to existing state-of-the-art solutions in pose estimation and\noffers improved dense mapping accuracy over pure LiDAR-based methods. 
The\nrelevant code and datasets will be made available at https://**.\n","authors":["Zhong Wang","Lele Ren","Yue Wen","Hesheng Wang"],"pdf_url":"https://arxiv.org/pdf/2501.04263v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04228v1","updated":"2025-01-08T01:59:47Z","published":"2025-01-08T01:59:47Z","title":"Constraints as Rewards: Reinforcement Learning for Robots without Reward\n Functions","summary":" Reinforcement learning has become an essential algorithm for generating\ncomplex robotic behaviors. However, to learn such behaviors, it is necessary to\ndesign a reward function that describes the task, which often consists of\nmultiple objectives that need to be balanced. This tuning process is known as\nreward engineering and typically involves extensive trial-and-error. In this\npaper, to avoid this trial-and-error process, we propose the concept of\nConstraints as Rewards (CaR). CaR formulates the task objective using multiple\nconstraint functions instead of a reward function and solves a reinforcement\nlearning problem with constraints using the Lagrangian method. By adopting this\napproach, different objectives are automatically balanced, because Lagrange\nmultipliers serve as the weights among the objectives. In addition, we will\ndemonstrate that constraints, expressed as inequalities, provide an intuitive\ninterpretation of the optimization target designed for the task. 
We apply the\nproposed method to the standing-up motion generation task of a\nsix-wheeled-telescopic-legged robot and demonstrate that the proposed method\nsuccessfully acquires the target behavior, even though it is challenging to\nlearn with manually designed reward functions.\n","authors":["Yu Ishihara","Noriaki Takasugi","Kotaro Kawakami","Masaya Kinoshita","Kazumi Aoyama"],"pdf_url":"https://arxiv.org/pdf/2501.04228v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04216v2","updated":"2025-01-08T01:16:25Z","published":"2024-07-05T02:00:47Z","title":"Safe MPC Alignment with Human Directional Feedback","summary":" In safety-critical robot planning or control, manually specifying safety\nconstraints or learning them from demonstrations can be challenging. In this\narticle, we propose a certifiable alignment method for a robot to learn a\nsafety constraint in its model predictive control (MPC) policy with human\nonline directional feedback. To our knowledge, it is the first method to learn\nsafety constraints from human feedback. The proposed method is based on an\nempirical observation: human directional feedback, when available, tends to\nguide the robot toward safer regions. The method only requires the direction of\nhuman feedback to update the learning hypothesis space. It is certifiable,\nproviding an upper bound on the total number of human feedback in the case of\nsuccessful learning, or declaring the hypothesis misspecification, i.e., the\ntrue implicit safety constraint cannot be found within the specified hypothesis\nspace. We evaluated the proposed method using numerical examples and user\nstudies in two simulation games. Additionally, we implemented and tested the\nproposed method on a real-world Franka robot arm performing mobile\nwater-pouring tasks. 
The results demonstrate the efficacy and efficiency of our\nmethod, showing that it enables a robot to successfully learn safety\nconstraints with a small handful (tens) of human directional corrections.\n","authors":["Zhixian Xie","Wenlong Zhang","Yi Ren","Zhaoran Wang","George J. Pappas","Wanxin Jin"],"pdf_url":"https://arxiv.org/pdf/2407.04216v2.pdf","comment":"16 pages, submission to T-RO"},{"id":"http://arxiv.org/abs/2501.04194v1","updated":"2025-01-08T00:06:43Z","published":"2025-01-08T00:06:43Z","title":"STLCG++: A Masking Approach for Differentiable Signal Temporal Logic\n Specification","summary":" Signal Temporal Logic (STL) offers a concise yet expressive framework for\nspecifying and reasoning about spatio-temporal behaviors of robotic systems.\nAttractively, STL admits the notion of robustness, the degree to which an input\nsignal satisfies or violates an STL specification, thus providing a nuanced\nevaluation of system performance. Notably, the differentiability of STL\nrobustness enables direct integration to robotics workflows that rely on\ngradient-based optimization, such as trajectory optimization and deep learning.\nHowever, existing approaches to evaluating and differentiating STL robustness\nrely on recurrent computations, which become inefficient with longer sequences,\nlimiting their use in time-sensitive applications. In this paper, we present\nSTLCG++, a masking-based approach that parallelizes STL robustness evaluation\nand backpropagation across timesteps, achieving more than 1000x faster\ncomputation time than the recurrent approach. We also introduce a smoothing\ntechnique for differentiability through time interval bounds, expanding STL's\napplicability in gradient-based optimization tasks over spatial and temporal\nvariables. 
Finally, we demonstrate STLCG++'s benefits through three robotics\nuse cases and provide open-source Python libraries in JAX and PyTorch for\nseamless integration into modern robotics workflows.\n","authors":["Parv Kapoor","Kazuki Mizuta","Eunsuk Kang","Karen Leung"],"pdf_url":"https://arxiv.org/pdf/2501.04194v1.pdf","comment":"To be submitted to robotics journal for review"},{"id":"http://arxiv.org/abs/2501.04193v1","updated":"2025-01-08T00:06:38Z","published":"2025-01-08T00:06:38Z","title":"GNN-based Decentralized Perception in Multirobot Systems for Predicting\n Worker Actions","summary":" In industrial environments, predicting human actions is essential for\nensuring safe and effective collaboration between humans and robots. This paper\nintroduces a perception framework that enables mobile robots to understand and\nshare information about human actions in a decentralized way. The framework\nfirst allows each robot to build a spatial graph representing its surroundings,\nwhich it then shares with other robots. This shared spatial data is combined\nwith temporal information to track human behavior over time. A swarm-inspired\ndecision-making process is used to ensure all robots agree on a unified\ninterpretation of the human's actions. Results show that adding more robots and\nincorporating longer time sequences improve prediction accuracy. Additionally,\nthe consensus mechanism increases system resilience, making the multi-robot\nsetup more reliable in dynamic industrial settings.\n","authors":["Ali Imran","Giovanni Beltrame","David St-Onge"],"pdf_url":"https://arxiv.org/pdf/2501.04193v1.pdf","comment":"Submitted to RA-L"},{"id":"http://arxiv.org/abs/2407.05910v3","updated":"2025-01-08T23:40:38Z","published":"2024-07-08T13:15:11Z","title":"Enhancing Vision-Language Models with Scene Graphs for Traffic Accident\n Understanding","summary":" Recognizing a traffic accident is an essential part of any autonomous driving\nor road monitoring system. 
An accident can appear in a wide variety of forms,\nand understanding what type of accident is taking place may be useful to\nprevent it from recurring. This work focuses on classifying traffic scenes into\nspecific accident types. We approach the problem by representing a traffic\nscene as a graph, where objects such as cars can be represented as nodes, and\nrelative distances and directions between them as edges. This representation of\na traffic scene is referred to as a scene graph, and can be used as input for\nan accident classifier. Better results are obtained with a classifier that\nfuses the scene graph input with visual and textual representations. This work\nintroduces a multi-stage, multimodal pipeline that pre-processes videos of\ntraffic accidents, encodes them as scene graphs, and aligns this representation\nwith vision and language modalities before executing the classification task.\nWhen trained on 4 classes, our method achieves a balanced accuracy score of\n57.77% on an (unbalanced) subset of the popular Detection of Traffic Anomaly\n(DoTA) benchmark, representing an increase of close to 5 percentage points from\nthe case where scene graph information is not taken into account.\n","authors":["Aaron Lohner","Francesco Compagno","Jonathan Francis","Alessandro Oltramari"],"pdf_url":"https://arxiv.org/pdf/2407.05910v3.pdf","comment":"Won the 'Best Paper Runner-up Award' at the 2024 IEEE International\n Automated Vehicle Validation Conference (IAVVC 2024). 
Also accepted at the\n 1st Workshop on Semantic Reasoning and Goal Understanding in Robotics, at the\n Robotics Science and Systems Conference (RSS SemRob 2024)"},{"id":"http://arxiv.org/abs/2501.04860v1","updated":"2025-01-08T22:22:15Z","published":"2025-01-08T22:22:15Z","title":"Exploring the Use of Robots for Diary Studies","summary":" As interest in studying in-the-wild human-robot interaction grows, there is a\nneed for methods to collect data over time and in naturalistic or potentially\nprivate environments. HRI researchers have increasingly used the diary method\nfor these studies, asking study participants to self-administer a structured\ndata collection instrument, i.e., a diary, over a period of time. Although the\ndiary method offers a unique window into settings that researchers may not have\naccess to, they also lack the interactivity and probing that interview-based\nmethods offer. In this paper, we explore a novel data collection method in\nwhich a robot plays the role of an interactive diary. We developed the Diary\nRobot system and performed in-home deployments for a week to evaluate the\nfeasibility and effectiveness of this approach. Using traditional text-based\nand audio-based diaries as benchmarks, we found that robots are able to\neffectively elicit the intended information. We reflect on our findings, and\ndescribe scenarios where the utilization of robots in diary studies as a data\ncollection instrument may be especially applicable.\n","authors":["Michael F. 
Xu","Bilge Mutlu"],"pdf_url":"https://arxiv.org/pdf/2501.04860v1.pdf","comment":"Proceedings of the 29th ACM/IEEE International Conference on Human\n Robot Interaction (HRI 2025)"},{"id":"http://arxiv.org/abs/2405.05210v2","updated":"2025-01-08T22:09:46Z","published":"2024-05-08T16:58:22Z","title":"TCAFF: Temporal Consistency for Robot Frame Alignment","summary":" In the field of collaborative robotics, the ability to communicate spatial\ninformation like planned trajectories and shared environment information is\ncrucial. When no global position information is available (e.g., indoor or\nGPS-denied environments), agents must align their coordinate frames before\nshared spatial information can be properly expressed and interpreted.\nCoordinate frame alignment is particularly difficult when robots have no\ninitial alignment and are affected by odometry drift. To this end, we develop a\nnovel multiple hypothesis algorithm, called TCAFF, for aligning the coordinate\nframes of neighboring robots. TCAFF considers potential alignments from\nassociating sparse open-set object maps and leverages temporal consistency to\ndetermine an initial alignment and correct for drift, all without any initial\nknowledge of neighboring robot poses. We demonstrate TCAFF being used for frame\nalignment in a collaborative object tracking application on a team of four\nrobots tracking six pedestrians and show that TCAFF enables robots to achieve a\ntracking accuracy similar to that of a system with ground truth localization.\nThe code and hardware dataset are available at\nhttps://github.com/mit-acl/tcaff.\n","authors":["Mason B. Peterson","Parker C. Lusk","Antonio Avila","Jonathan P. 
How"],"pdf_url":"https://arxiv.org/pdf/2405.05210v2.pdf","comment":"7 pages, 6 figures"},{"id":"http://arxiv.org/abs/2501.04823v1","updated":"2025-01-08T20:22:16Z","published":"2025-01-08T20:22:16Z","title":"Learning Robot Safety from Sparse Human Feedback using Conformal\n Prediction","summary":" Ensuring robot safety can be challenging; user-defined constraints can miss\nedge cases, policies can become unsafe even when trained from safe data, and\nsafety can be subjective. Thus, we learn about robot safety by showing policy\ntrajectories to a human who flags unsafe behavior. From this binary feedback,\nwe use the statistical method of conformal prediction to identify a region of\nstates, potentially in learned latent space, guaranteed to contain a\nuser-specified fraction of future policy errors. Our method is\nsample-efficient, as it builds on nearest neighbor classification and avoids\nwithholding data as is common with conformal prediction. By alerting if the\nrobot reaches the suspected unsafe region, we obtain a warning system that\nmimics the human's safety preferences with guaranteed miss rate. From video\nlabeling, our system can detect when a quadcopter visuomotor policy will fail\nto steer through a designated gate. We present an approach for policy\nimprovement by avoiding the suspected unsafe region. With it we improve a model\npredictive controller's safety, as shown in experimental testing with 30\nquadcopter flights across 6 navigation tasks. Code and videos are provided.\n","authors":["Aaron O. Feldman","Joseph A. 
Vincent","Maximilian Adang","Jun En Low","Mac Schwager"],"pdf_url":"https://arxiv.org/pdf/2501.04823v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.19813v2","updated":"2025-01-08T19:50:09Z","published":"2024-12-10T19:58:47Z","title":"Coverage Path Planning in Precision Agriculture: Algorithms,\n Applications, and Key Benefits","summary":" Coverage path planning (CPP) is the task of computing an optimal path within\na region to completely scan or survey an area of interest using one or multiple\nmobile robots. Robots equipped with sensors and cameras can collect vast\namounts of data on crop health, soil conditions, and weather patterns. Advanced\nanalytics can then be applied to this data to make informed decisions,\nimproving overall farm management. In this paper, we will demonstrate one\napproach to find the optimal coverage path of an agricultural field using a\nsingle robot, and one using multiple robots. For the single robot, we used a\nwavefront coverage algorithm that generates a sequence of locations that the\nrobot needs to follow. For the multi-robot approach, the proposed approach\nconsists of two steps: dividing the agricultural field into convex polygonal\nareas to optimally distribute them among the robots, and generating an optimal\ncoverage path to ensure minimum coverage time for each of the polygonal areas.\n","authors":["Jahid Chowdhury Choton","William H. 
Hsu"],"pdf_url":"https://arxiv.org/pdf/2412.19813v2.pdf","comment":"The co-authors have asked to withdraw this paper, since it contains\n incomplete and incorrect informations"},{"id":"http://arxiv.org/abs/2412.16186v2","updated":"2025-01-08T19:49:53Z","published":"2024-12-12T16:57:49Z","title":"Formal Modeling and Verification of Publisher-Subscriber Paradigm in ROS\n 2","summary":" The Robot Operating System (ROS) is one of the most popular middleware for\ndeveloping robot applications, but it is subject to major shortcomings when\napplied to real-time robotic systems in safety-critical environments. For this\nreason, ROS 2 was released in 2017 for implementing real-time capabilities in\ndistributed robotic systems while supporting the most prominent aspects of the\noriginal ROS. There is still not much work done to provide formal guarantees\nand correctness of a ROS program. In this paper, we propose a framework to\naddress this challenging problem of guaranteeing the correct behaviour of\nrobotic systems. We propose a formal modelling of a ROS 2 program, and also\ndescribe the program using a network of timed automata. We then prove that the\nsets of executions of a ROS program in the model and in the network of timed\nautomata are the same. Thus to analyze a publisher-subscriber scenario of ROS 2\nprogram, our algorithm first converts the program into the model, and then into\nthe network of timed automata. 
The applicability and validity of our approach\nare verified by conducting several experiments on a simplified system and an\nactual robotic system, and the results and limitations are discussed.\n","authors":["Jahid Chowdhury Choton","Lipsy Gupta","Pavithra Prabhakar"],"pdf_url":"https://arxiv.org/pdf/2412.16186v2.pdf","comment":"The co-authors have asked to withdraw this paper, since it contains\n incomplete and incorrect informations"},{"id":"http://arxiv.org/abs/2501.04759v1","updated":"2025-01-08T17:28:30Z","published":"2025-01-08T17:28:30Z","title":"Optimize the parameters of the PID Controller using Genetic Algorithm\n for Robot Manipulators","summary":" This paper presents the design of a Proportional-Integral-Derivative (PID)\ncontroller with optimized parameters for a two-degree-of-freedom robotic arm. A\ngenetic algorithm (GA) is proposed to optimize the controller parameters,\naddressing the challenges in determining PID controller parameters for highly\nnonlinear systems like robotic arms compared to traditional methods. The\nGA-optimized PID controller significantly improves control accuracy and\nperformance over traditional control methods. Simulation results demonstrate\nthat the robotic arm system operates with high precision and stability.\nAdditionally, the shortened trajectory tracking response time enhances the\nfeasibility of applying this control algorithm in real-world scenarios. 
This\nresearch not only confirms the suitability of PID-GA for robotic arms and\nsimilar systems but also opens new avenues for applying this algorithm to real\nphysical systems.\n","authors":["Vu Ngoc Son","Pham Van Cuong","Nguyen Duy Minh","Phi Hoang Nha"],"pdf_url":"https://arxiv.org/pdf/2501.04759v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04755v1","updated":"2025-01-08T16:57:44Z","published":"2025-01-08T16:57:44Z","title":"Improving Human-Robot Teaching by Quantifying and Reducing Mental Model\n Mismatch","summary":" The rapid development of artificial intelligence and robotics has had a\nsignificant impact on our lives, with intelligent systems increasingly\nperforming tasks traditionally performed by humans. Efficient knowledge\ntransfer requires matching the mental model of the human teacher with the\ncapabilities of the robot learner. This paper introduces the Mental Model\nMismatch (MMM) Score, a feedback mechanism designed to quantify and reduce\nmismatches by aligning human teaching behavior with robot learning behavior.\nUsing Large Language Models (LLMs), we analyze teacher intentions in natural\nlanguage to generate adaptive feedback. A study with 150 participants teaching\na virtual robot to solve a puzzle game shows that intention-based feedback\nsignificantly outperforms traditional performance-based feedback or no\nfeedback. The results suggest that intention-based feedback improves\ninstructional outcomes, improves understanding of the robot's learning process\nand reduces misconceptions. 
This research addresses a critical gap in\nhuman-robot interaction (HRI) by providing a method to quantify and mitigate\ndiscrepancies between human mental models and robot capabilities, with the goal\nof improving robot learning and human teaching effectiveness.\n","authors":["Phillip Richter","Heiko Wersing","Anna-Lisa Vollmer"],"pdf_url":"https://arxiv.org/pdf/2501.04755v1.pdf","comment":"11 Pages, 4 Figures"},{"id":"http://arxiv.org/abs/2501.04754v1","updated":"2025-01-08T16:57:11Z","published":"2025-01-08T16:57:11Z","title":"Development of an Adaptive Sliding Mode Controller using Neural Networks\n for Trajectory Tracking of a Cylindrical Manipulator","summary":" Cylindrical manipulators are extensively used in industrial automation,\nespecially in emerging technologies like 3D printing, which represents a\nsignificant future trend. However, controlling the trajectory of nonlinear\nmodels with system uncertainties remains a critical challenge, often leading to\nreduced accuracy and reliability. To address this, the study develops an\nAdaptive Sliding Mode Controller (ASMC) integrated with Neural Networks (NNs)\nto improve trajectory tracking for cylindrical manipulators. The ASMC leverages\nthe robustness of sliding mode control and the adaptability of neural networks\nto handle uncertainties and dynamic variations effectively. 
Simulation results\nvalidate that the proposed ASMC-NN achieves high trajectory tracking accuracy,\nfast response time, and enhanced reliability, making it a promising solution\nfor applications in 3D printing and beyond.\n","authors":["TieuNien Le","VanCuong Pham","NgocSon Vu"],"pdf_url":"https://arxiv.org/pdf/2501.04754v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05483v1","updated":"2025-01-08T03:47:52Z","published":"2025-01-08T03:47:52Z","title":"Human Grasp Generation for Rigid and Deformable Objects with Decomposed\n VQ-VAE","summary":" Generating realistic human grasps is crucial yet challenging for object\nmanipulation in computer graphics and robotics. Current methods often struggle\nto generate detailed and realistic grasps with full finger-object interaction,\nas they typically rely on encoding the entire hand and estimating both posture\nand position in a single step. Additionally, simulating object deformation\nduring grasp generation is still difficult, as modeling such deformation\nrequires capturing the comprehensive relationship among points of the object's\nsurface. To address these limitations, we propose a novel improved Decomposed\nVector-Quantized Variational Autoencoder (DVQ-VAE-2), which decomposes the hand\ninto distinct parts and encodes them separately. This part-aware architecture\nallows for more precise management of hand-object interactions. Furthermore, we\nintroduce a dual-stage decoding strategy that first predicts the grasp type\nunder skeletal constraints and then identifies the optimal grasp position,\nenhancing both the realism and adaptability of the model to unseen\ninteractions. Furthermore, we introduce a new Mesh UFormer as the backbone\nnetwork to extract the hierarchical structural representations from the mesh\nand propose a new normal vector-guided position encoding to simulate the\nhand-object deformation. 
In experiments, our model achieves a relative\nimprovement of approximately 14.1% in grasp quality compared to\nstate-of-the-art methods across four widely used benchmarks. Our comparisons\nwith other backbone networks show relative improvements of 2.23% in Hand-object\nContact Distance and 5.86% in Quality Index on deformable and rigid object\nbased datasets, respectively. Our source code and model are available at\nhttps://github.com/florasion/D-VQVAE.\n","authors":["Mengshi Qi","Zhe Zhao","Huadong Ma"],"pdf_url":"https://arxiv.org/pdf/2501.05483v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06235v1","updated":"2025-01-08T09:08:06Z","published":"2025-01-08T09:08:06Z","title":"NextStop: An Improved Tracker For Panoptic LIDAR Segmentation Data","summary":" 4D panoptic LiDAR segmentation is essential for scene understanding in\nautonomous driving and robotics, combining semantic and instance segmentation\nwith temporal consistency. Current methods, like 4D-PLS and 4D-STOP, use a\ntracking-by-detection methodology, employing deep learning networks to perform\nsemantic and instance segmentation on each frame. To maintain temporal\nconsistency, large-size instances detected in the current frame are compared\nand associated with instances within a temporal window that includes the\ncurrent and preceding frames. However, their reliance on short-term instance\ndetection, lack of motion estimation, and exclusion of small-sized instances\nlead to frequent identity switches and reduced tracking performance. We address\nthese issues with the NextStop1 tracker, which integrates Kalman filter-based\nmotion estimation, data association, and lifespan management, along with a\ntracklet state concept to improve prioritization. 
Evaluated using the LiDAR\nSegmentation and Tracking Quality (LSTQ) metric on the SemanticKITTI validation\nset, NextStop demonstrated enhanced tracking performance, particularly for\nsmall-sized objects like people and bicyclists, with fewer ID switches, earlier\ntracking initiation, and improved reliability in complex environments. The\nsource code is available at https://github.com/AIROTAU/NextStopTracker\n","authors":["Nirit Alkalay","Roy Orfaig","Ben-Zion Bobrovsky"],"pdf_url":"https://arxiv.org/pdf/2501.06235v1.pdf","comment":null}],"Systems and Control":[{"id":"http://arxiv.org/abs/2501.02792v2","updated":"2025-01-08T17:58:19Z","published":"2025-01-06T06:25:46Z","title":"Gaming on Coincident Peak Shaving: Equilibrium and Strategic Behavior","summary":" Coincident peak demand charges are imposed by power system operators or\nelectric utilities when the overall system demand, aggregated across multiple\nconsumers, reaches its peak. These charges incentivize consumers to reduce\ntheir demand during peak periods, a practice known as coincident peak shaving.\nIn this paper, we analyze the coincident peak shaving problem through the lens\nof game theory, developing a theoretical model to examine the impact of\nstrategic consumer behavior on system efficiency. We demonstrate that the game\nstructure exhibits varying characteristics - concave,\nquasiconcave/discontinuous, or non-concave/discontinuous - depending on the\nextent of consumers demand-shifting capabilities. For a two-agent, two-period\nsetting, we derive closed-form Nash equilibrium solutions under each condition\nand generalize our findings to cases with multiple agents. We prove the\nstability of the equilibrium points and present an algorithm for computing\nequilibrium outcomes across all game scenarios. We also show that the\npeak-shaving effectiveness of the game model matches that of the centralized\npeak-shaving model but with increased levels of anarchy. 
In the cases of\nquasiconcave and non-concave game conditions, we analytically demonstrate in\nthe two-agent setting that anarchy increases with consumers' flexibility and\ninequity, as measured by their marginal shifting costs, and we also analyze the\ninfluence of the number of agents on anarchy. Finally, we provide numerical\nsimulations to validate our theoretical results.\n","authors":["Liudong Chen","Bolun Xu"],"pdf_url":"https://arxiv.org/pdf/2501.02792v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04623v1","updated":"2025-01-08T17:09:50Z","published":"2025-01-08T17:09:50Z","title":"Large-scale Grid Optimization: The Workhorse of Future Grid Computations","summary":" Purpose: The computation methods for modeling, controlling and optimizing the\ntransforming grid are evolving rapidly. We review and systemize knowledge for a\nspecial class of computation methods that solve large-scale power grid\noptimization problems. Summary: Large-scale grid optimizations are pertinent\nfor, amongst other things, hedging against risk due to resource stochasticity,\nevaluating aggregated DERs' impact on grid operation and design, and improving\nthe overall efficiency of grid operation in terms of cost, reliability, and\ncarbon footprint. We attribute the continual growth in scale and complexity of\ngrid optimizations to a large influx of new spatial and temporal features in\nboth transmission (T) and distribution (D) networks. Therefore, to systemize\nknowledge in the field, we discuss the recent advancements in T and D systems\nfrom the viewpoint of mechanistic physics-based and emerging data-driven\nmethods. Findings: We find that while mechanistic physics-based methods are\nleading the science in solving large-scale grid optimizations, data-driven\ntechniques, especially physics-constrained ones, are emerging as an alternative\nto solve otherwise intractable problems. 
We also find observable gaps in the\nfield and ascertain these gaps from the paper's literature review and by\ncollecting and synthesizing feedback from industry experts.\n","authors":["Amritanshu Pandey","Mads Almassalkhi","Sam Chevalier"],"pdf_url":"https://arxiv.org/pdf/2501.04623v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04572v1","updated":"2025-01-08T15:42:41Z","published":"2025-01-08T15:42:41Z","title":"Regret Analysis: a control perspective","summary":" Online learning and model reference adaptive control have many interesting\nintersections. One area where they differ however is in how the algorithms are\nanalyzed and what objective or metric is used to discriminate \"good\" algorithms\nfrom \"bad\" algorithms. In adaptive control there are usually two objectives: 1)\nprove that all time varying parameters/states of the system are bounded, and 2)\nthat the instantaneous error between the adaptively controlled system and a\nreference system converges to zero over time (or at least a compact set). For\nonline learning the performance of algorithms is often characterized by the\nregret the algorithm incurs. Regret is defined as the cumulative loss (cost)\nover time from the online algorithm minus the cumulative loss (cost) of the\nsingle optimal fixed parameter choice in hindsight. Another significant\ndifference between the two areas of research is with regard to the assumptions\nmade in order to obtain said results. Adaptive control makes assumptions about\nthe input-output properties of the control problem and derives solutions for a\nfixed error model or optimization task. In the online learning literature\nresults are derived for classes of loss functions (i.e. convex) while a priori\nassuming that all time varying parameters are bounded, which for many\noptimization tasks is not unrealistic, but is a non starter in control\napplications. 
In this work we discuss these differences in detail through the\nregret based analysis of gradient descent for convex functions and the control\nbased analysis of a streaming regression problem. We close with a discussion\nabout the newly defined paradigm of online adaptive control and ask the\nfollowing question \"Are regret optimal control strategies deployable?\"\n","authors":["Travis E. Gibson","Sawal Acharya"],"pdf_url":"https://arxiv.org/pdf/2501.04572v1.pdf","comment":"10 pages no figures"},{"id":"http://arxiv.org/abs/2501.04566v1","updated":"2025-01-08T15:26:59Z","published":"2025-01-08T15:26:59Z","title":"Recursive Least Squares with Fading Regularization for Finite-Time\n Convergence without Persistent Excitation","summary":" This paper extends recursive least squares (RLS) to include time-varying\nregularization. This extension provides flexibility for updating the least\nsquares regularization term in real time. Existing results with constant\nregularization imply that the parameter-estimation error dynamics of RLS are\nglobally attractive to zero if and only if the regressor is weakly persistently\nexciting. This work shows that, by extending classical RLS to include a\ntime-varying (fading) regularization term that converges to zero, the\nparameter-estimation error dynamics are globally attractive to zero without\nweakly persistent excitation. Moreover, if the fading regularization term\nconverges to zero in finite time, then the parameter estimation error also\nconverges to zero in finite time. Finally, we propose rank-1 fading\nregularization (R1FR) RLS, a time-varying regularization algorithm with fading\nregularization that converges to zero, and which runs in the same computational\ncomplexity as classical RLS. Numerical examples are presented to validate\ntheoretical guarantees and to show how R1FR-RLS can protect against\nover-regularization.\n","authors":["Brian Lai","Dimitra Panagou","Dennis S. 
Bernstein"],"pdf_url":"https://arxiv.org/pdf/2501.04566v1.pdf","comment":"Submitted to the 2025 American Control Conference"},{"id":"http://arxiv.org/abs/2501.04508v1","updated":"2025-01-08T13:55:22Z","published":"2025-01-08T13:55:22Z","title":"New Linear Model of a Composite Energy Storage System with Realizable\n Dispatch Guarantees","summary":" To optimize battery dispatch, a model is required that can predict the state\nof charge (SOC) trajectory and ensure dispatch is admissible (i.e., does not\nlead to unexpected SOC saturation). But battery dispatch optimization is\ninherently challenging since batteries cannot simultaneously charge and\ndischarge, which begets a non-convex complementarity constraint. In this paper,\nwe consider a composition of energy storage elements that can charge or\ndischarge independently and provide a sufficient linear energy storage model of\nthe composite battery. This permits convex optimization of the composite\nbattery SOC trajectory while ensuring admissibility of the resulting\n(aggregated) power schedule and disaggregation to the individual energy storage\nelements.\n","authors":["Mazen Elsaadany","Mads R. Almassalkhi","Simon H. Tindemans"],"pdf_url":"https://arxiv.org/pdf/2501.04508v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04481v1","updated":"2025-01-08T13:04:08Z","published":"2025-01-08T13:04:08Z","title":"Safe Reinforcement Learning with Minimal Supervision","summary":" Reinforcement learning (RL) in the real world necessitates the development of\nprocedures that enable agents to explore without causing harm to themselves or\nothers. The most successful solutions to the problem of safe RL leverage\noffline data to learn a safe-set, enabling safe online exploration. 
However,\nthis approach to safe-learning is often constrained by the demonstrations that\nare available for learning.\n In this paper we investigate the influence of the quantity and quality of\ndata used to train the initial safe learning problem offline on the ability to\nlearn safe-RL policies online. Specifically, we focus on tasks with spatially\nextended goal states where we have few or no demonstrations available.\nClassically this problem is addressed either by using hand-designed controllers\nto generate data or by collecting user-generated demonstrations. However, these\nmethods are often expensive and do not scale to more complex tasks and\nenvironments. To address this limitation we propose an unsupervised RL-based\noffline data collection procedure, to learn complex and scalable policies\nwithout the need for hand-designed controllers or user demonstrations. Our\nresearch demonstrates the significance of providing sufficient demonstrations\nfor agents to learn optimal safe-RL policies online, and as a result, we\npropose optimistic forgetting, a novel online safe-RL approach that is\npractical for scenarios with limited data. Further, our unsupervised data\ncollection approach highlights the need to balance diversity and optimality for\nsafe online exploration.\n","authors":["Alexander Quessy","Thomas Richardson","Sebastian East"],"pdf_url":"https://arxiv.org/pdf/2501.04481v1.pdf","comment":"Initially submitted to ICML 2023"},{"id":"http://arxiv.org/abs/2501.04437v1","updated":"2025-01-08T11:37:35Z","published":"2025-01-08T11:37:35Z","title":"Integrating LLMs with ITS: Recent Advances, Potentials, Challenges, and\n Future Directions","summary":" Intelligent Transportation Systems (ITS) are crucial for the development and\noperation of smart cities, addressing key challenges in efficiency,\nproductivity, and environmental sustainability. This paper comprehensively\nreviews the transformative potential of Large Language Models (LLMs) in\noptimizing ITS. 
Initially, we provide an extensive overview of ITS,\nhighlighting its components, operational principles, and overall effectiveness.\nWe then delve into the theoretical background of various LLM techniques, such\nas GPT, T5, CTRL, and BERT, elucidating their relevance to ITS applications.\nFollowing this, we examine the wide-ranging applications of LLMs within ITS,\nincluding traffic flow prediction, vehicle detection and classification,\nautonomous driving, traffic sign recognition, and pedestrian detection. Our\nanalysis reveals how these advanced models can significantly enhance traffic\nmanagement and safety. Finally, we explore the challenges and limitations LLMs\nface in ITS, such as data availability, computational constraints, and ethical\nconsiderations. We also present several future research directions and\npotential innovations to address these challenges. This paper aims to guide\nresearchers and practitioners through the complexities and opportunities of\nintegrating LLMs in ITS, offering a roadmap to create more efficient,\nsustainable, and responsive next-generation transportation systems.\n","authors":["Doaa Mahmud","Hadeel Hajmohamed","Shamma Almentheri","Shamma Alqaydi","Lameya Aldhaheri","Ruhul Amin Khalil","Nasir Saeed"],"pdf_url":"https://arxiv.org/pdf/2501.04437v1.pdf","comment":"Accepted for publication in IEEE Transactions on Intelligent\n Transportation Systems"},{"id":"http://arxiv.org/abs/2501.04422v1","updated":"2025-01-08T11:15:04Z","published":"2025-01-08T11:15:04Z","title":"A new methodology for the optimization of bolt tightening sequences for\n ring type joints","summary":" Achieving uniform bolt load distribution is critical to obtain leak-free\nservice in pressure vessel gasketed joints used in offshore pipelines. This is\na difficult task due to bolt load variations during the assembly process. 
In\nthis sense, the Elastic Interaction Coefficients Method has been developed in\nprevious works to define tightening sequences that provide the target load at\nthe end of the sequence in one or two passes. The method is very costly because\na complete sequence must be simulated and the load of every bolt must be\nmeasured after each tightening operation. The present work validates this\nmethod for Ring Type Joints and further develops a numerically and\nexperimentally validated new methodology that provides highly satisfactory\nresults with a significantly lower cost.\n","authors":["Ibai Coria","Mikel Abasolo","Imanol Olaskoaga","Arkaitz Etxezarreta","Josu Aguirrebeitia"],"pdf_url":"https://arxiv.org/pdf/2501.04422v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04080v3","updated":"2025-01-08T09:03:14Z","published":"2024-02-06T15:34:30Z","title":"Entropy-regularized Diffusion Policy with Q-Ensembles for Offline\n Reinforcement Learning","summary":" This paper presents advanced techniques of training diffusion policies for\noffline reinforcement learning (RL). At the core is a mean-reverting stochastic\ndifferential equation (SDE) that transfers a complex action distribution into a\nstandard Gaussian and then samples actions conditioned on the environment state\nwith a corresponding reverse-time SDE, like a typical diffusion policy. We show\nthat such an SDE has a solution that we can use to calculate the log\nprobability of the policy, yielding an entropy regularizer that improves the\nexploration of offline datasets. To mitigate the impact of inaccurate value\nfunctions from out-of-distribution data points, we further propose to learn the\nlower confidence bound of Q-ensembles for more robust policy improvement. By\ncombining the entropy-regularized diffusion policy with Q-ensembles in offline\nRL, our method achieves state-of-the-art performance on most tasks in D4RL\nbenchmarks. 
Code is available at\nhttps://github.com/ruoqizzz/Entropy-Regularized-Diffusion-Policy-with-QEnsemble.\n","authors":["Ruoqi Zhang","Ziwei Luo","Jens Sjölund","Thomas B. Schön","Per Mattsson"],"pdf_url":"https://arxiv.org/pdf/2402.04080v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04275v1","updated":"2025-01-08T04:54:14Z","published":"2025-01-08T04:54:14Z","title":"Adaptive Numerical Differentiation for Extremum Seeking with Sensor\n Noise","summary":" Extremum-seeking control (ESC) is widely used to optimize performance when\nthe system dynamics are uncertain. However, sensitivity to sensor noise is an\nimportant issue in ESC implementation due to the use of high-pass filters or\ngradient estimators. To reduce the sensitivity of ESC to noise, this paper\ninvestigates the use of adaptive input and state estimation (AISE) for\nnumerical differentiation. In particular, this paper develops extremum-seeking\ncontrol with adaptive input and state estimation (ESC/AISE), where the\nhigh-pass filter of ESC is replaced by AISE to improve performance under sensor\nnoise. The effectiveness of ESC/AISE is illustrated via numerical examples.\n","authors":["Shashank Verma","Juan Augusto Paredes Salazar","Jhon Manuel Portella Delgado","Ankit Goel","Dennis S. Bernstein"],"pdf_url":"https://arxiv.org/pdf/2501.04275v1.pdf","comment":"8 pages, 13 figures. Submitted to ACC 2025"},{"id":"http://arxiv.org/abs/2501.04273v1","updated":"2025-01-08T04:45:14Z","published":"2025-01-08T04:45:14Z","title":"Frenet-Serret-Based Trajectory Prediction","summary":" Trajectory prediction is a crucial element of guidance, navigation, and\ncontrol systems. This paper presents two novel trajectory-prediction methods\nbased on real-time position measurements and adaptive input and state\nestimation (AISE). The first method, called AISE/va, uses position measurements\nto estimate the target velocity and acceleration. 
The second method, called\nAISE/FS, models the target trajectory as a 3D curve using the Frenet-Serret\nformulas, which require estimates of velocity, acceleration, and jerk. To\nestimate velocity, acceleration, and jerk in real time, AISE computes first,\nsecond, and third derivatives of the position measurements. AISE does not rely\non assumptions about the target maneuver, measurement noise, or disturbances.\nFor trajectory prediction, both methods use measurements of the target position\nand estimates of its derivatives to extrapolate from the current position. The\nperformance of AISE/va and AISE/FS is compared numerically with the\n$\\alpha$-$\\beta$-$\\gamma$ filter, which shows that AISE/FS provides more\naccurate trajectory prediction than AISE/va and traditional methods, especially\nfor complex target maneuvers.\n","authors":["Shashank Verma","Dennis S. Bernstein"],"pdf_url":"https://arxiv.org/pdf/2501.04273v1.pdf","comment":"8 pages, 6 figures. Submitted to ACC 2025"},{"id":"http://arxiv.org/abs/2501.04262v1","updated":"2025-01-08T04:10:43Z","published":"2025-01-08T04:10:43Z","title":"Target Tracking Using the Invariant Extended Kalman Filter with\n Numerical Differentiation for Estimating Curvature and Torsion","summary":" The goal of target tracking is to estimate target position, velocity, and\nacceleration in real time using position data. This paper introduces a novel\ntarget-tracking technique that uses adaptive input and state estimation (AISE)\nfor real-time numerical differentiation to estimate velocity, acceleration, and\njerk from position data. These estimates are used to model the target motion\nwithin the Frenet-Serret (FS) frame. By representing the model in SE(3), the\nposition and velocity are estimated using the invariant extended Kalman filter\n(IEKF). The proposed method, called FS-IEKF-AISE, is illustrated by numerical\nexamples and compared to prior techniques.\n","authors":["Shashank Verma","Dennis S. 
Bernstein"],"pdf_url":"https://arxiv.org/pdf/2501.04262v1.pdf","comment":"7 pages, 8 figures, submitted to ACC 2025"},{"id":"http://arxiv.org/abs/2411.06107v2","updated":"2025-01-08T02:50:58Z","published":"2024-11-09T08:01:17Z","title":"A capacity renting framework for shared energy storage considering\n peer-to-peer energy trading of prosumers with privacy protection","summary":" Shared energy storage systems (ESS) present a promising solution to the\ntemporal imbalance between energy generation from renewable distributed\ngenerators (DGs) and the power demands of prosumers. However, as DG penetration\nrates rise, spatial energy imbalances become increasingly significant,\nnecessitating the integration of peer-to-peer (P2P) energy trading within the\nshared ESS framework. Two key challenges emerge in this context: the absence of\neffective mechanisms and the greater difficulty for privacy protection due to\nincreased data communication. This research proposes a capacity renting\nframework for shared ESS considering P2P energy trading of prosumers. In the\nproposed framework, prosumers can participate in P2P energy trading and rent\ncapacities from shared ESS. A generalized Nash game is formulated to model the\ntrading process and the competitive interactions among prosumers, and the\nvariational equilibrium of the game is proved to be equivalent to the optimal\nsolution of a quadratic programming (QP) problem. To address the privacy\nprotection concern, the problem is solved using the alternating direction\nmethod of multipliers (ADMM) with the Paillier cryptosystem. 
Finally, numerical\nsimulations demonstrate the impact of P2P energy trading on the shared ESS\nframework and validate the effectiveness of the proposed privacy-preserving\nalgorithm.\n","authors":["Yingcong Sun","Laijun Chen","Yue Chen","Mingrui Tang","Shengwei Mei"],"pdf_url":"https://arxiv.org/pdf/2411.06107v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04242v1","updated":"2025-01-08T02:47:07Z","published":"2025-01-08T02:47:07Z","title":"Beam Domain Channel Estimation for Spatial Non-Stationary Massive MIMO\n Systems","summary":" In massive multiple-input multiple-output (MIMO) systems, the channel\nestimation scheme is subject to the spatial non-stationarity and inevitably\npower leakage in the beam domain. In this paper, a beam domain channel\nestimation scheme is investigated for spatial non-stationary (SNS) massive MIMO\nsystems considering power leakage.\nSpecifically, a realistic massive MIMO beam domain channel model (BDCM) is\nintroduced to capture the spatial non-stationarity considering power leakage by\nintroducing the illustration of visibility region (VR). Then, a beam domain\nstructure-based sparsity adaptive matching pursuit (BDS-SAMP) scheme is\nproposed based on the cross-block sparse structure and power ratio threshold of\nbeam domain channel. 
Finally, the simulation results validate the accuracy of\nproposed BDS-SAMP scheme with low pilot overhead and reasonable complexity by\ncomparing with conventional schemes.\n","authors":["Lin Hou","Hengtai Chang","Cheng-Xiang Wang","Jie Huang","Songjiang Yang"],"pdf_url":"https://arxiv.org/pdf/2501.04242v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04240v1","updated":"2025-01-08T02:35:32Z","published":"2025-01-08T02:35:32Z","title":"A Novel Non-Stationary Channel Emulator for 6G MIMO Wireless Channels","summary":" The performance evaluation of sixth generation (6G) communication systems is\nanticipated to be a controlled and repeatable process in the lab, which brings\nup the demand for wireless channel emulators. However, channel emulation for 6G\nspace-time-frequency (STF) non-stationary channels is missing currently. In\nthis paper, a non-stationary multiple-input multiple-output (MIMO)\ngeometry-based stochastic model (GBSM) that accurately characterizes the\nchannel STF properties is introduced firstly. Then, a subspace-based method is\nproposed for reconstructing the channel fading obtained from the GBSM and a\nchannel emulator architecture with frequency domain processing is presented for\n6G MIMO systems. Moreover, the spatial time-varying channel transfer functions\n(CTFs) of the channel simulation and the channel emulation are compared and\nanalyzed. The Doppler power spectral density (PSD) and delay PSD are further\nderived and compared between the channel model simulation and subspace-based\nemulation. 
The results demonstrate that the proposed channel emulator is\ncapable of reproducing the non-stationary channel characteristics.\n","authors":["Yuan Zong","Lijian Xin","Jie Huang","Cheng-Xiang Wang"],"pdf_url":"https://arxiv.org/pdf/2501.04240v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04238v1","updated":"2025-01-08T02:32:15Z","published":"2025-01-08T02:32:15Z","title":"A Quasi-deterministic Channel Model for Underwater Acoustic\n Communication Systems","summary":" In this paper, a quasi-deterministic (Q-D) model for non-stationary\nunderwater acoustic (UWA) channels is proposed. This model combines the BELLHOP\ndeterministic model and geometry-based stochastic model (GBSM), which provides\nhigher accuracy and flexibility. Different propagation components in shallow\nwater are classified as D-rays, R-rays and F-rays in the proposed model, where\nD-rays are modeled by BELLHOP while both R-rays and F-rays are modeled by GBSM.\nSome important channel statistical properties, including time-frequency\ncorrelation function (TF-CF), Doppler power spectrum density (PSD), average\nDoppler shift, and RMS Doppler spread are derived and simulated. Finally,\nsimulation results illustrate the correctness of the proposed model.\n","authors":["Yuxuan Yang","Yilin Ma","Hengtai Chang","Cheng-Xiang Wang"],"pdf_url":"https://arxiv.org/pdf/2501.04238v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04222v1","updated":"2025-01-08T01:39:10Z","published":"2025-01-08T01:39:10Z","title":"Privacy-Preserving Distributed Online Mirror Descent for Nonconvex\n Optimization","summary":" We investigate the distributed online nonconvex optimization problem with\ndifferential privacy over time-varying networks. 
Each node minimizes the sum of\nseveral nonconvex functions while preserving the node's differential privacy.\nWe propose a privacy-preserving distributed online mirror descent algorithm for\nnonconvex optimization, which uses the mirror descent to update decision\nvariables and the Laplace differential privacy mechanism to protect privacy.\nUnlike the existing works, the proposed algorithm allows the cost functions to\nbe nonconvex, which is more applicable. Based upon these, we prove that if the\ncommunication network is $B$-strongly connected and the constraint set is\ncompact, then by choosing the step size properly, the algorithm guarantees\n$\\epsilon$-differential privacy at each time. Furthermore, we prove that if the\nlocal cost functions are $\\beta$-smooth, then the regret over time horizon $T$\ngrows sublinearly while preserving differential privacy, with an upper bound\n$O(\\sqrt{T})$. Finally, the effectiveness of the algorithm is demonstrated\nthrough numerical simulations.\n","authors":["Yingjie Zhou","Tao Li"],"pdf_url":"https://arxiv.org/pdf/2501.04222v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00958v2","updated":"2025-01-08T23:16:20Z","published":"2024-05-02T02:50:58Z","title":"Generative manufacturing systems using diffusion models and ChatGPT","summary":" In this study, we introduce Generative Manufacturing Systems (GMS) as a novel\napproach to effectively manage and coordinate autonomous manufacturing assets,\nthereby enhancing their responsiveness and flexibility to address a wide array\nof production objectives and human preferences. Deviating from traditional\nexplicit modeling, GMS employs generative AI, including diffusion models and\nChatGPT, for implicit learning from envisioned futures, marking a shift from a\nmodel-optimum to a training-sampling decision-making. 
Through the integration\nof generative AI, GMS enables complex decision-making through interactive\ndialogue with humans, allowing manufacturing assets to generate multiple\nhigh-quality global decisions that can be iteratively refined based on human\nfeedback. Empirical findings showcase GMS's substantial improvement in system\nresilience and responsiveness to uncertainties, with decision times reduced\nfrom seconds to milliseconds. The study underscores the inherent creativity and\ndiversity in the generated solutions, facilitating human-centric\ndecision-making through seamless and continuous human-machine interactions.\n","authors":["Xingyu Li","Fei Tao","Wei Ye","Aydin Nassehi","John W. Sutherland"],"pdf_url":"https://arxiv.org/pdf/2405.00958v2.pdf","comment":"We are withdrawing this preprint to incorporate significant new\n results and expand the scope of the paper. We plan to resubmit a\n substantially revised version in the near future"},{"id":"http://arxiv.org/abs/2403.07988v2","updated":"2025-01-08T22:31:48Z","published":"2024-03-12T18:00:29Z","title":"Configuration and EMT Simulation of the 240-bus MiniWECC System\n Integrating Offshore Wind Farms (OWFs)","summary":" As offshore wind farms (OWFs) become increasingly prevalent in Northern\nCalifornia and Southern Oregon, they introduce faster dynamics into the Western\nElectricity Coordinating Council (WECC) system, reshaping its dynamic behavior.\nAccordingly, electromagnetic transient (EMT) simulation is essential to assess\nhigh frequency dynamics of the WECC system with integrated OWFs. Against this\nbackground, this paper presents the integration of detailed dynamic models of\nOWFs into a 240-bus miniWECC system in PSCAD software. The sequential\ninitialization technique is employed to facilitate the smooth initiation of a\nlarge-scale system in an EMT simulation. 
The performance of the configured\nmodel is assessed under wind speed variations and grounded faults,\ndemonstrating the effectiveness of the miniWECC system with OWFs. This system\nserves as a valuable basic use case for validating the fast dynamic performance\nof future WECC systems with high penetration of wind energy.\n","authors":["Buxin She","Hisham Mahmood","Marcelo Elizondo","Veronica Adetola","Yuqing Dong"],"pdf_url":"https://arxiv.org/pdf/2403.07988v2.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2501.04839v1","updated":"2025-01-08T20:57:14Z","published":"2025-01-08T20:57:14Z","title":"DRL-Based Medium-Term Planning of Renewable-Integrated Self-Scheduling\n Cascaded Hydropower to Guide Wholesale Market Participation","summary":" For self-scheduling cascaded hydropower (S-CHP) facilities, medium-term\nplanning is a critical step that coordinates water availability over the\nmedium-term horizon, providing water usage guidance for their short-term\noperations in wholesale market participation. Typically, medium-term planning\nstrategies (e.g., reservoir storage targets at the end of each short-term\nperiod) are determined by either optimization methods or rules of thumb.\nHowever, with the integration of variable renewable energy sources (VRESs),\noptimization-based methods suffer from deviations between the anticipated and\nactual reservoir storage, while rules of thumb could be financially\nconservative, thereby compromising short-term operating profitability in\nwholesale market participation. This paper presents a deep reinforcement\nlearning (DRL)-based framework to derive medium-term planning policies for\nVRES-integrated S-CHPs (VS-CHPs), which can leverage contextual information\nunderneath individual short-term periods and train planning policies by their\ninduced short-term operating profits in wholesale market participation. The\nproposed DRL-based framework offers two practical merits. 
First, its planning\nstrategies consider both seasonal requirements of reservoir storage and needs\nfor short-term operating profits. Second, it adopts a multi-parametric\nprogramming-based strategy to accelerate the expensive training process\nassociated with multi-step short-term operations. Finally, the DRL-based\nframework is evaluated on a real-world VS-CHP, demonstrating its advantages\nover current practice.\n","authors":["Xianbang Chen","Yikui Liu","Neng Fan","Lei Wu"],"pdf_url":"https://arxiv.org/pdf/2501.04839v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.09876v2","updated":"2025-01-08T20:50:02Z","published":"2024-09-15T21:52:14Z","title":"A Carryover Storage Valuation Framework for Medium-Term Cascaded\n Hydropower Planning: A Portland General Electric System Study","summary":" Medium-term planning of cascaded hydropower (CHP) determines appropriate\ncarryover storage levels in reservoirs to optimize the usage of available water\nresources. This optimization seeks to maximize the hydropower generated in the\ncurrent period (i.e., immediate benefit) plus the potential hydropower\ngeneration in the future period (i.e., future value). Thus, in the medium-term\nCHP planning, properly quantifying the future value deposited in carryover\nstorage is essential to achieve a balanced trade-off between immediate benefit\nand future value. To this end, this paper presents a framework to quantify the\nfuture value of carryover storage, which consists of three major steps: i)\nconstructing a model to calculate the maximum possible hydropower generation\nthat a given level of carryover storage can deliver in the future period; ii)\nextracting the implicit locational marginal water value (LMWV) of carryover\nstorage for each reservoir by applying a partition-then-extract algorithm to\nthe constructed model; and iii) developing a set of analytical rules based on\nthe extracted LMWV to effectively calculate the future value. 
These rules can\nbe seamlessly integrated into medium-term CHP planning models as tractable\nmixed-integer linear constraints to quantify the future value properly, and can\nbe easily visualized to offer valuable insights for CHP operators. Finally,\nnumerical results on a CHP system of Portland General Electric demonstrate the\neffectiveness of the presented framework in determining proper carryover\nstorage values to facilitate medium-term CHP planning.\n","authors":["Xianbang Chen","Yikui Liu","Zhiming Zhong","Neng Fan","Zhechong Zhao","Lei Wu"],"pdf_url":"https://arxiv.org/pdf/2409.09876v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.01615v2","updated":"2025-01-08T20:48:41Z","published":"2025-01-03T03:19:02Z","title":"Equity Impacts of Public Transit Network Redesign with Shared Autonomous\n Mobility Services","summary":" This study examines the equity impacts of integrating shared autonomous\nmobility services (SAMS) into transit system redesign. Using the Greater\nChicago area as a case study, we compare two optimization objectives in\nmultimodal transit network redesign: minimizing total generalized costs\n(equity-agnostic) versus prioritizing service in low-income areas\n(equity-focused). We evaluate the achieved accessibility of clustered zones\nwith redesigned transit networks under two objectives, compared to driving and\nthe existing transit network. The transit access gaps across zones and between\ntransit and driving are found to be generally reduced with the introduction of\nSAMS, but less so with the subsequent improved infrastructure under budget.\nDifferential improvement in equity is seen across suburbs and areas of the\ncity, reflecting the disparity in current transit access and improvement\npotential. In particular, SAMS bridges the transit access gaps in suburban and\ncity areas currently underserved by transit. 
The City of Chicago, which is also\ndisproportionately home to vulnerable populations, offers an avenue to improve\nvertical equity. These findings demonstrate that SAMS can enhance both\nhorizontal and vertical equity in transit systems, particularly when equity is\nexplicitly incorporated into the design objective.\n","authors":["Max T. M. Ng","Meredith Raymer","Hani S. Mahmassani","Omer Verbas","Taner Cokyasar"],"pdf_url":"https://arxiv.org/pdf/2501.01615v2.pdf","comment":"Restructuring the paper for more precise research direction"},{"id":"http://arxiv.org/abs/2501.04830v1","updated":"2025-01-08T20:36:10Z","published":"2025-01-08T20:36:10Z","title":"A Deep Learning-Based Method for Power System Resilience Evaluation","summary":" Power systems are critical infrastructure in modern society, and power\noutages can cause significant disruptions to communities and individuals' daily\nlives. The resilience of a power system measures its ability to maintain power\nsupply during highly disruptive events such as hurricanes, earthquakes, and\nthunderstorms. Traditional methods for quantifying power system resilience\ninclude statistics-based and simulation-based approaches. Statistics-based\nmethods offer a retrospective analysis of system performance without requiring\na physical model, while simulation-based methods necessitate detailed physical\nsystem information and often simplify real-world scenarios. This paper\nintroduces a deep learning-based method for evaluating power system resilience\nusing historical power outage data. The method leverages the generalization\ncapabilities of deep learning models and incorporates socio-economic and\ndemographic factors as weighting terms to highlight the impacts on vulnerable\ndemographic groups. The effectiveness of the proposed method is demonstrated\nthrough two case studies: one with real historical outage data and the other\nwith simulated outage records. 
This approach provides valuable insights into\nmeasuring power system resilience against hazardous weather events without\nrequiring a physical model of the target systems. The evaluation results can\nfurther guide the planning of distributed energy resources for resilience\nenhancement.\n","authors":["Xuesong Wang","Caisheng Wang"],"pdf_url":"https://arxiv.org/pdf/2501.04830v1.pdf","comment":"Submitted to IEEE Transactions on Power Systems"},{"id":"http://arxiv.org/abs/2408.14292v2","updated":"2025-01-08T19:37:25Z","published":"2024-08-26T14:24:35Z","title":"Decentralized Singular Value Decomposition for Large-scale Distributed\n Sensor Networks","summary":" This article studies the problem of decentralized Singular Value\nDecomposition (d-SVD), which is fundamental in various signal processing\napplications. Two scenarios are considered depending on the availability of the\ndata matrix under consideration. In the first scenario, the matrix of interest\nis row-wisely available in each local node in the network. In the second\nscenario, the matrix of interest implicitly forms an outer product from two\ndifferent series of measurements. By combining the lightweight local rational\nfunction approximation approach with parallel averaging consensus algorithms,\ntwo d-SVD algorithms are proposed to cope with the two aforementioned\nscenarios. We evaluate the proposed algorithms using two application examples:\ndecentralized sensor localization via low-rank matrix completion and\ndecentralized passive radar detection. Moreover, a novel and non-trivial\ntruncation technique, which employs a representative vector that is orthonormal\nto the principal signal subspace, is proposed to further reduce the\ncommunication cost associated with the d-SVD algorithms. 
Simulation results\nshow that the proposed d-SVD algorithms converge to the centralized solution\nwith reduced communication cost compared to those facilitated with the\nstate-of-the-art decentralized power method.\n","authors":["Yufan Fan","Marius Pesavento"],"pdf_url":"https://arxiv.org/pdf/2408.14292v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04796v1","updated":"2025-01-08T19:23:45Z","published":"2025-01-08T19:23:45Z","title":"Democratic Resilience and Sociotechnical Shocks","summary":" We focus on the potential fragility of democratic elections given modern\ninformation-communication technologies (ICT) in the Web 2.0 era. Our work\nprovides an explanation for the cascading attrition of public officials\nrecently in the United States and offers potential policy interventions from a\ndynamic system's perspective. We propose that micro-level heterogeneity across\nindividuals within crucial institutions leads to vulnerabilities of election\nsupport systems at the macro scale. Our analysis provides comparative\nstatistics to measure the fragility of systems against targeted harassment,\ndisinformation campaigns, and other adversarial manipulations that are now\ncheaper to scale and deploy. Our analysis also informs policy interventions\nthat seek to retain public officials and increase voter turnout. We show how\nlimited resources (for example, salary incentives to public officials and\ntargeted interventions to increase voter turnout) can be allocated at the\npopulation level to improve these outcomes and maximally enhance democratic\nresilience. On the one hand, structural and individual heterogeneity cause\nsystemic fragility that adversarial actors can exploit, but also provide\nopportunities for effective interventions that offer significant global\nimprovements from limited and localized actions.\n","authors":["M. Amin Rahimian","Michael P. 
Colaresi"],"pdf_url":"https://arxiv.org/pdf/2501.04796v1.pdf","comment":"Computational and Mathematical Organization Theory, forthcoming"},{"id":"http://arxiv.org/abs/2501.04793v1","updated":"2025-01-08T19:18:39Z","published":"2025-01-08T19:18:39Z","title":"A Novel Observer Design for LuGre Friction Estimation and Control","summary":" Dynamic components of the friction may directly impact the stability and\nperformance of the motion control systems. The LuGre model is a prevalent\nfriction model utilized to express this dynamic behavior. Since the LuGre model\nis very comprehensive, friction compensation based on it might be challenging.\nInspired by this, we develop a novel observer to estimate and compensate for\nLuGre friction. Furthermore, we present a Lyapunov stability analysis to show\nthat observer dynamics are asymptotically stable under certain conditions.\nCompared to its counterparts, the proposed observer constitutes a simple and\nstandalone scheme that can be utilized with arbitrary control inputs in a\nstraightforward way. As a primary difference, the presented observer estimates\nvelocity and uses the velocity error to estimate friction in addition to\ncontrol input. The extensive simulations revealed that the introduced observer\nenhances position and velocity tracking performance in the presence of\nfriction.\n","authors":["Caner Odabaş","Ömer Morgül"],"pdf_url":"https://arxiv.org/pdf/2501.04793v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04783v1","updated":"2025-01-08T19:01:32Z","published":"2025-01-08T19:01:32Z","title":"Traffic Simulations: Multi-City Calibration of Metropolitan Highway\n Networks","summary":" This paper proposes an approach to perform travel demand calibration for\nhigh-resolution stochastic traffic simulators. It employs abundant travel times\nat the path-level, departing from the standard practice of resorting to scarce\nsegment-level sensor counts. 
The proposed approach is shown to tackle\nhigh-dimensional instances in a sample-efficient way. For the first time, case\nstudies on 6 metropolitan highway networks are carried out, considering a total\nof 54 calibration scenarios. This is the first work to show the ability of a\ncalibration algorithm to systematically scale across networks. Compared to the\nstate-of-the-art simultaneous perturbation stochastic approximation (SPSA)\nalgorithm, the proposed approach enhances fit to field data by an average 43.5%\nwith a maximum improvement of 80.0%, and does so within fewer simulation calls.\n","authors":["Chao Zhang","Yechen Li","Neha Arora","Damien Pierce","Carolina Osorio"],"pdf_url":"https://arxiv.org/pdf/2501.04783v1.pdf","comment":"Published on the 27th IEEE International Conference on Intelligent\n Transportation Systems (ITSC) (2024)"},{"id":"http://arxiv.org/abs/2501.04759v1","updated":"2025-01-08T17:28:30Z","published":"2025-01-08T17:28:30Z","title":"Optimize the parameters of the PID Controller using Genetic Algorithm\n for Robot Manipulators","summary":" This paper presents the design a Proportional-Integral-Derivative (PID)\ncontroller with optimized parameters for a two-degree-of-freedom robotic arm. A\ngenetic algorithm (GA) is proposed to optimize the controller parameters,\naddressing the challenges in determining PID controller parameters for highly\nnonlinear systems like robotic arms compared to traditional methods. The\nGA-optimized PID controller significantly improves control accuracy and\nperformance over traditional control methods. Simulation results demonstrate\nthat the robotic arm system operates with high precision and stability.\nAdditionally, the shortened trajectory tracking response time enhances the\nfeasibility of applying this control algorithm in realworld scenarios. 
This\nresearch not only confirms the suitability of PID-GA for robotic arms and\nsimilar systems but also opens new avenues for applying this algorithm to real\nphysical systems.\n","authors":["Vu Ngoc Son","Pham Van Cuong","Nguyen Duy Minh","Phi Hoang Nha"],"pdf_url":"https://arxiv.org/pdf/2501.04759v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04754v1","updated":"2025-01-08T16:57:11Z","published":"2025-01-08T16:57:11Z","title":"Development of an Adaptive Sliding Mode Controller using Neural Networks\n for Trajectory Tracking of a Cylindrical Manipulator","summary":" Cylindrical manipulators are extensively used in industrial automation,\nespecially in emerging technologies like 3D printing, which represents a\nsignificant future trend. However, controlling the trajectory of nonlinear\nmodels with system uncertainties remains a critical challenge, often leading to\nreduced accuracy and reliability. To address this, the study develops an\nAdaptive Sliding Mode Controller (ASMC) integrated with Neural Networks (NNs)\nto improve trajectory tracking for cylindrical manipulators. The ASMC leverages\nthe robustness of sliding mode control and the adaptability of neural networks\nto handle uncertainties and dynamic variations effectively. 
Simulation results\nvalidate that the proposed ASMC-NN achieves high trajectory tracking accuracy,\nfast response time, and enhanced reliability, making it a promising solution\nfor applications in 3D printing and beyond.\n","authors":["TieuNien Le","VanCuong Pham","NgocSon Vu"],"pdf_url":"https://arxiv.org/pdf/2501.04754v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04746v1","updated":"2025-01-08T10:02:22Z","published":"2025-01-08T10:02:22Z","title":"Towards resilient cities: A hybrid simulation framework for risk\n mitigation through data driven decision making","summary":" Providing a comprehensive view of the city operation and offering useful\nmetrics for decision making is a well known challenge for urban risk analysis\nsystems. Existing systems are, in many cases, generalizations of previous\ndomain specific tools and or methodologies that may not cover all urban\ninterdependencies and makes it difficult to have homogeneous indicators. In\norder to overcome this limitation while seeking for effective support to\ndecision makers, this article introduces a novel hybrid simulation framework\nfor risk mitigation. The framework is built on a proposed city concept that\nconsiders urban space as a Complex Adaptive System composed by interconnected\nCritical Infrastructures. In this concept, a Social System, which models daily\npatterns and social interactions of the citizens in the Urban Landscape, drives\nthe CIs demand to configure the full city picture. The frameworks hybrid design\nintegrates agent based and network based modeling by breaking down city agents\ninto system dependent subagents, to enable both inter and intra system\ninteraction simulation, respectively. A layered structure of indicators at\ndifferent aggregation levels is also developed, to ensure that decisions are\nnot only data driven but also explainable. 
Therefore, the proposed simulation\nframework can serve as a DSS tool that allows the quantitative analysis of the\nimpact of threats at different levels. First, system level metrics can be used\nto get a broad view on the city resilience. Then, agent level metrics back\nthose figures and provide better explainability. On implementation, the\nproposed framework enables component reusability (for eased coding), simulation\nfederation (enabling the integration of existing system oriented simulators),\ndiscrete simulation in accelerated time (for rapid scenario simulation) and\ndecision oriented visualization (for informed outputs).\n","authors":["David Carraminana","Ana M. Bernardos","Juan A. Besada","Jose R. Casar"],"pdf_url":"https://arxiv.org/pdf/2501.04746v1.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2501.07590v1","updated":"2025-01-08T11:17:44Z","published":"2025-01-08T11:17:44Z","title":"Ultrafast pulsed laser evaluation of Single Event Transients in\n opto-couplers","summary":" We build a 1064 nm fiber laser system-based testing facility for emulating\nSETs in different electronics components and ICs. Using these facilities, we\ntested the 4N35 optocoupler to observe SETs for the first time.\n","authors":["Kavin Dave","Aditya Mukherjee","Hari Shanker Gupta","Deepak Jain","Shalabh Gupta"],"pdf_url":"https://arxiv.org/pdf/2501.07590v1.pdf","comment":"Accepted in CLEO 2023, San Jose, USA and CLEO 2024, North Carolina,\n USA for in poster presentation. However due to lack of funds, we could not\n travel"}],"Optimization and Control":[{"id":"http://arxiv.org/abs/2501.04668v1","updated":"2025-01-08T18:28:56Z","published":"2025-01-08T18:28:56Z","title":"Semilinear Dynamic Programming: Analysis, Algorithms, and Certainty\n Equivalence Properties","summary":" We consider a broad class of dynamic programming (DP) problems that involve a\npartially linear structure and some positivity properties in their system\nequation and cost function. 
We address deterministic and stochastic problems,\npossibly with Markov jump parameters. We focus primarily on infinite horizon\nproblems and prove that under our assumptions, the optimal cost function is\nlinear, and that an optimal policy can be computed efficiently with standard DP\nalgorithms. Moreover, we show that forms of certainty equivalence hold for our\nstochastic problems, in analogy with the classical linear quadratic optimal\ncontrol problems.\n","authors":["Yuchao Li","Dimitri Bertsekas"],"pdf_url":"https://arxiv.org/pdf/2501.04668v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04658v1","updated":"2025-01-08T18:13:35Z","published":"2025-01-08T18:13:35Z","title":"Quadratic-form Optimal Transport","summary":" We introduce the framework of quadratic-form optimal transport (QOT), whose\ntransport cost has the form $\\iint c\\,\\mathrm{d}\\pi \\otimes\\mathrm{d}\\pi$ for\nsome coupling $\\pi$ between two marginals. Interesting examples of\nquadratic-form transport cost and their optimization include the variance of a\nbivariate function, covariance, Kendall's tau, the Gromov--Wasserstein\ndistance, quadratic assignment problems, and quadratic regularization of\nclassic optimal transport. QOT leads to substantially different mathematical\nstructures compared to classic transport problems and many technical\nchallenges. We illustrate the fundamental properties of QOT, provide several\ncases where explicit solutions are obtained, and give general lower bounds of\nthe optimal transport costs. 
For a wide class of cost functions, including the\nrectangular cost functions, the QOT problem is solved by a new coupling called\nthe diamond transport, whose copula is supported on a diamond in the unit\nsquare.\n","authors":["Ruodu Wang","Zhenyuan Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.04658v1.pdf","comment":"43 pages, 5 figures"},{"id":"http://arxiv.org/abs/2501.04629v1","updated":"2025-01-08T17:20:55Z","published":"2025-01-08T17:20:55Z","title":"Characterizations of Variational Convexity and Tilt Stability via\n Quadratic Bundles","summary":" In this paper, we establish characterizations of variational $s$-convexity\nand tilt stability for prox-regular functions in the absence of subdifferential\ncontinuity via quadratic bundles, a kind of primal-dual generalized\nsecond-order derivatives recently introduced by Rockafellar. Deriving such\ncharacterizations in the effective pointbased form requires a certain revision\nof quadratic bundles investigated below. Our device is based on the notion of\ngeneralized twice differentiability and its novel characterization via\nclassical twice differentiability of the associated Moreau envelopes combined\nwith various limiting procedures for functions and sets.\n","authors":["Pham Duy Khanh","Boris S. 
Mordukhovich","Vo Thanh Phat","Le Duc Viet"],"pdf_url":"https://arxiv.org/pdf/2501.04629v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04603v1","updated":"2025-01-08T16:40:58Z","published":"2025-01-08T16:40:58Z","title":"Infinite Horizon Fully Coupled Nonlinear Forward-Backward Stochastic\n Difference Equations and their Application to LQ Optimal Control Problems","summary":" This paper focuses on the study of infinite horizon fully coupled nonlinear\nforward-backward stochastic difference equations (FBS$\\bigtriangleup$Es).\nFirstly, we establish a pair of priori estimates for the solutions to forward\nstochastic difference equations (FS$\\bigtriangleup$Es) and backward stochastic\ndifference equations (BS$\\bigtriangleup$Es) respectively. Then, to achieve\nbroader applicability, we utilize a set of domination-monotonicity conditions\nwhich are more lenient than general ones. Using these conditions, we apply\ncontinuation methods to prove the unique solvability of infinite horizon fully\ncoupled FBS$\\bigtriangleup$Es and derive a set of solution estimates.\nFurthermore, our results have considerable implications for a variety of\nrelated linear quadratic (LQ) problems, especially when the stochastic\nHamiltonian system is consistent with FBS$\\bigtriangleup$Es satisfying these\nintroduced domination-monotonicity conditions. 
Thus, by solving the associated\nstochastic Hamiltonian system, we can derive an explicit expression for the\nunique optimal control.\n","authors":["Xinyu Ma","Xun Li","Qingxin Meng"],"pdf_url":"https://arxiv.org/pdf/2501.04603v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2410.01749"},{"id":"http://arxiv.org/abs/2404.03604v3","updated":"2025-01-08T16:14:22Z","published":"2024-04-04T17:25:25Z","title":"A Unified Algorithmic Framework for Dynamic Assortment Optimization\n under MNL Choice","summary":" We consider assortment and inventory planning problems with dynamic\nstockout-based substitution effects, and without replenishment, in two\ndifferent settings: (1) Customers can see all available products when they\narrive, a typical scenario in physical stores. (2) The seller can choose to\noffer a subset of available products to each customer, which is more common on\nonline platforms. Both settings are known to be computationally challenging,\nand the current approximation algorithms for the two settings are quite\ndifferent. We develop a unified algorithm framework under the MNL choice model\nfor both settings. Our algorithms improve on the state-of-the-art algorithms in\nterms of approximation guarantee and runtime, and the ability to manage\nuncertainty in the total number of customers and handle more complex\nconstraints. 
In the process, we establish various novel properties of dynamic\nassortment planning (for the MNL choice model) that may be useful more broadly.\n","authors":["Shuo Sun","Rajan Udwani","Zuo-Jun Max Shen"],"pdf_url":"https://arxiv.org/pdf/2404.03604v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.03496v2","updated":"2025-01-08T16:14:12Z","published":"2024-08-07T01:33:44Z","title":"A three-stage method for reconstructing multiple coefficients in coupled\n photoacoustic and diffuse optical imaging","summary":" This paper studies inverse problems in quantitative photoacoustic tomography\nwith additional optical current data supplemented from diffuse optical\ntomography. We propose a three-stage image reconstruction method for the\nsimultaneous recovery of the absorption, diffusion, and Gr\\\"uneisen\ncoefficients. We demonstrate, through numerical simulations, that: (i) when the\nGr\\\"uneisen coefficient is known, the addition of the optical measurements\nallows a more accurate reconstruction of the scattering and absorption\ncoefficients; and (ii) when the Gr\\\"uneisen coefficient is not known, the\naddition of optical current measurements allows us to reconstruct uniquely the\nGr\\\"uneisen, the scattering and absorption coefficients. 
Numerical simulations\nbased on synthetic data are presented to demonstrate the effectiveness of the\nproposed idea.\n","authors":["Yinxi Pan","Kui Ren","Shanyin Tong"],"pdf_url":"https://arxiv.org/pdf/2408.03496v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04585v1","updated":"2025-01-08T16:06:15Z","published":"2025-01-08T16:06:15Z","title":"Accelerated Extragradient-Type Methods -- Part 2: Generalization and\n Sublinear Convergence Rates under Co-Hypomonotonicity","summary":" Following the first part of our project, this paper comprehensively studies\ntwo types of extragradient-based methods: anchored extragradient and Nesterov's\naccelerated extragradient for solving [non]linear inclusions (and, in\nparticular, equations), primarily under the Lipschitz continuity and the\nco-hypomonotonicity assumptions. We unify and generalize a class of anchored\nextragradient methods for monotone inclusions to a wider range of schemes\nencompassing existing algorithms as special cases. We establish\n$\\mathcal{O}(1/k)$ last-iterate convergence rates on the residual norm of the\nunderlying mapping for this general framework and then specialize it to obtain\nconvergence guarantees for specific instances, where $k$ denotes the iteration\ncounter. We extend our approach to a class of anchored Tseng's\nforward-backward-forward splitting methods to obtain a broader class of\nalgorithms for solving co-hypomonotone inclusions. Again, we analyze\n$\\mathcal{O}(1/k)$ last-iterate convergence rates for this general scheme and\nspecialize it to obtain convergence results for existing and new variants. We\ngeneralize and unify Nesterov's accelerated extra-gradient method to a new\nclass of algorithms that covers existing schemes as special instances while\ngenerating new variants. For these schemes, we can prove $\\mathcal{O}(1/k)$\nlast-iterate convergence rates for the residual norm under co-hypomonotonicity,\ncovering a class of nonmonotone problems. 
We propose another novel class of\nNesterov's accelerated extragradient methods to solve inclusions.\nInterestingly, these algorithms achieve both $\\mathcal{O}(1/k)$ and $o(1/k)$\nlast-iterate convergence rates, and also the convergence of iterate sequences\nunder co-hypomonotonicity and Lipschitz continuity. Finally, we provide a set\nof numerical experiments encompassing different scenarios to validate our\nalgorithms and theoretical guarantees.\n","authors":["Quoc Tran-Dinh","Nghia Nguyen-Trung"],"pdf_url":"https://arxiv.org/pdf/2501.04585v1.pdf","comment":"75 pages, 7 figures, and 1 table"},{"id":"http://arxiv.org/abs/2401.03692v3","updated":"2025-01-08T16:05:00Z","published":"2024-01-08T06:46:39Z","title":"Boosting Column Generation with Graph Neural Networks for Joint Rider\n Trip Planning and Crew Shift Scheduling","summary":" Optimizing service schedules is pivotal to the reliable, efficient, and\ninclusive on-demand mobility. This pressing challenge is further exacerbated by\nthe increasing needs of an aging population, the oversubscription of existing\nservices, and the lack of effective solution methods. This study addresses the\nintricacies of service scheduling, by jointly optimizing rider trip planning\nand crew scheduling for a complex dynamic mobility service. The resulting\noptimization problems are extremely challenging computationally for\nstate-of-the-art methods. To address this fundamental gap, this paper\nintroduces the Joint Rider Trip Planning and Crew Shift Scheduling Problem\n(JRTPCSSP) and a novel solution method, called Attention and Gated GNN-Informed\nColumn Generation (AGGNNI-CG), that hybridizes column generation and machine\nlearning to obtain near-optimal solutions to the JRTPCSSP with real-life\nconstraints of the application. The key idea of the machine-learning component\nis to dramatically reduce the number of paths to explore in the pricing\nproblem, accelerating the most time-consuming component of the column\ngeneration. 
The machine learning component is a graph neural network with an\nattention mechanism and a gated architecture, which is particularly suited to\ncater for the different input sizes coming from daily operations. AGGNNI-CG has\nbeen applied to a challenging, real-world dataset from the Paratransit system\nof Chatham County in Georgia. It produces substantial improvements compared to\nthe baseline column generation approach, which typically cannot produce\nhigh-quality feasible solutions in reasonable time on large-scale complex\ninstances. AGGNNI-CG also produces significant improvements in service quality\ncompared to the existing system.\n","authors":["Jiawei Lu","Tinghan Ye","Wenbo Chen","Pascal Van Hentenryck"],"pdf_url":"https://arxiv.org/pdf/2401.03692v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.16957v3","updated":"2025-01-08T15:57:38Z","published":"2024-12-22T10:33:09Z","title":"Euclidean distance discriminants and Morse attractors","summary":" Our study concerns the Euclidean distance function in case of complex plane\ncurves. We decompose the ED discriminant into 3 parts which are responsible for\nthe 3 types of behavior of the Morse points, and we find the structure of each\none. In particular we shed light on the ``atypical discriminant'' which is due\nto the loss of Morse points at infinity. 
We find formulas for the number of\nMorse singularities which abut to the corresponding 3 types of attractors when\nmoving the centre of the distance function toward a point of the discriminant.\n","authors":["Cezar Joiţa","Dirk Siersma","Mihai Tibăr"],"pdf_url":"https://arxiv.org/pdf/2412.16957v3.pdf","comment":"several improvements in Section 3"},{"id":"http://arxiv.org/abs/2412.19367v3","updated":"2025-01-08T15:51:21Z","published":"2024-12-26T22:23:11Z","title":"Central limit theorems for vector-valued composite functionals with\n smoothing and applications","summary":" This paper focuses on vector-valued composite functionals, which may be\nnonlinear in probability. Our primary goal is to establish central limit\ntheorems for these functionals when mixed estimators are employed. Our study is\nrelevant to the evaluation and comparison of risk in decision-making contexts\nand extends to functionals that arise in machine learning methods. A\ngeneralized family of composite risk functionals is presented, which\nencompasses most of the known coherent risk measures including systemic\nmeasures of risk. The paper makes two main contributions. First, we analyze\nvector-valued functionals, providing a framework for evaluating\nhigh-dimensional risks. This framework facilitates the comparison of multiple\nrisk measures, as well as the estimation and asymptotic analysis of systemic\nrisk and its optimal value in decision-making problems. Second, we derive novel\ncentral limit theorems for optimized composite functionals when mixed types of\nestimators: empirical and smoothed estimators are used. We provide verifiable\nsufficient conditions for the central limit formulae and show their\napplicability to several popular measures of risk.\n","authors":["Huihui Chen","Darinka Dentcheva","Yang Lin","Gregory J. 
Stock"],"pdf_url":"https://arxiv.org/pdf/2412.19367v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04572v1","updated":"2025-01-08T15:42:41Z","published":"2025-01-08T15:42:41Z","title":"Regret Analysis: a control perspective","summary":" Online learning and model reference adaptive control have many interesting\nintersections. One area where they differ however is in how the algorithms are\nanalyzed and what objective or metric is used to discriminate \"good\" algorithms\nfrom \"bad\" algorithms. In adaptive control there are usually two objectives: 1)\nprove that all time varying parameters/states of the system are bounded, and 2)\nthat the instantaneous error between the adaptively controlled system and a\nreference system converges to zero over time (or at least a compact set). For\nonline learning the performance of algorithms is often characterized by the\nregret the algorithm incurs. Regret is defined as the cumulative loss (cost)\nover time from the online algorithm minus the cumulative loss (cost) of the\nsingle optimal fixed parameter choice in hindsight. Another significant\ndifference between the two areas of research is with regard to the assumptions\nmade in order to obtain said results. Adaptive control makes assumptions about\nthe input-output properties of the control problem and derives solutions for a\nfixed error model or optimization task. In the online learning literature\nresults are derived for classes of loss functions (i.e. convex) while a priori\nassuming that all time varying parameters are bounded, which for many\noptimization tasks is not unrealistic, but is a non starter in control\napplications. In this work we discuss these differences in detail through the\nregret based analysis of gradient descent for convex functions and the control\nbased analysis of a streaming regression problem. 
We close with a discussion\nabout the newly defined paradigm of online adaptive control and ask the\nfollowing question \"Are regret optimal control strategies deployable?\"\n","authors":["Travis E. Gibson","Sawal Acharya"],"pdf_url":"https://arxiv.org/pdf/2501.04572v1.pdf","comment":"10 pages no figures"},{"id":"http://arxiv.org/abs/2402.07099v3","updated":"2025-01-08T15:37:04Z","published":"2024-02-11T04:09:50Z","title":"Rethinking the Capacity of Graph Neural Networks for Branching Strategy","summary":" Graph neural networks (GNNs) have been widely used to predict properties and\nheuristics of mixed-integer linear programs (MILPs) and hence accelerate MILP\nsolvers. This paper investigates the capacity of GNNs to represent strong\nbranching (SB), the most effective yet computationally expensive heuristic\nemployed in the branch-and-bound algorithm. In the literature, message-passing\nGNN (MP-GNN), as the simplest GNN structure, is frequently used as a fast\napproximation of SB and we find that not all MILPs's SB can be represented with\nMP-GNN. We precisely define a class of \"MP-tractable\" MILPs for which MP-GNNs\ncan accurately approximate SB scores. Particularly, we establish a universal\napproximation theorem: for any data distribution over the MP-tractable class,\nthere always exists an MP-GNN that can approximate the SB score with\narbitrarily high accuracy and arbitrarily high probability, which lays a\ntheoretical foundation of the existing works on imitating SB with MP-GNN. For\nMILPs without the MP-tractability, unfortunately, a similar result is\nimpossible, which can be illustrated by two MILP instances with different SB\nscores that cannot be distinguished by any MP-GNN, regardless of the number of\nparameters. 
Recognizing this, we explore another GNN structure called the\nsecond-order folklore GNN (2-FGNN) that overcomes this limitation, and the\naforementioned universal approximation theorem can be extended to the entire\nMILP space using 2-FGNN, regardless of the MP-tractability. A small-scale\nnumerical experiment is conducted to directly validate our theoretical\nfindings.\n","authors":["Ziang Chen","Jialin Liu","Xiaohan Chen","Xinshang Wang","Wotao Yin"],"pdf_url":"https://arxiv.org/pdf/2402.07099v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04563v1","updated":"2025-01-08T15:24:20Z","published":"2025-01-08T15:24:20Z","title":"On Branch-and-Price for Project Scheduling","summary":" Integer programs for resource-constrained project scheduling problems are\nnotoriously hard to solve due to their weak linear relaxations. Several papers\nhave proposed reformulating project scheduling problems via Dantzig-Wolfe\ndecomposition to strengthen their linear relaxation and decompose large problem\ninstances. The reformulation gives rise to a master problem that has a large\nnumber of variables. Therefore, the master problem is solved by a column\ngeneration procedure embedded in a branching framework, also known as\nbranch-and-price. While branch-and-price has been successfully applied to many\nproblem classes, it turns out to be ineffective for most project scheduling\nproblems. This paper identifies drivers of the ineffectiveness by analyzing the\nstructure of the reformulated problem and the strength of different branching\nschemes. Our analysis shows that the reformulated problem has an unfavorable\nstructure for column generation: It is highly degenerate, slowing down the\nconvergence of column generation, and for many project scheduling problems, it\nyields the same or only slightly stronger linear relaxations as classical\nformulations at the expense of large increases in runtime. 
Our computational\nexperiments complement our theoretical findings.\n","authors":["Maximilian Kolter","Martin Grunow","Rainer Kolisch"],"pdf_url":"https://arxiv.org/pdf/2501.04563v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04158v3","updated":"2025-01-08T15:21:26Z","published":"2024-04-05T15:10:30Z","title":"Hardness of circuit and monotone diameters of polytopes","summary":" The Circuit diameter of polytopes was introduced by Borgwardt, Finhold and\nHemmecke as a fundamental tool for the study of circuit augmentation schemes\nfor linear programming and for estimating combinatorial diameters. Determining\nthe complexity of computing the circuit diameter of polytopes was posed as an\nopen problem by Sanit\\`a as well as by Kafer, and was recently reiterated by\nBorgwardt, Grewe, Kafer, Lee and Sanit\\`a.\n In this paper, we solve this problem by showing that computing the circuit\ndiameter of a polytope given in halfspace-description is strongly NP-hard. To\nprove this result, we show that computing the combinatorial diameter of the\nperfect matching polytope of a bipartite graph is NP-hard. This complements a\nresult by Sanit\\`a (FOCS 2018) on the NP-hardness of computing the diameter of\nfractional matching polytopes and implies the new result that computing the\ndiameter of a $\\{0,1\\}$-polytope is strongly NP-hard, which may be of\nindependent interest. In our second main result, we give a precise\ngraph-theoretic description of the monotone diameter of perfect matching\npolytopes and use this description to prove that computing the monotone\n(circuit) diameter of a given input polytope is strongly NP-hard as well.\n","authors":["Christian Nöbel","Raphael Steiner"],"pdf_url":"https://arxiv.org/pdf/2404.04158v3.pdf","comment":"21 pages, 9 figures. 
Restructured paper"},{"id":"http://arxiv.org/abs/2501.04548v1","updated":"2025-01-08T14:54:18Z","published":"2025-01-08T14:54:18Z","title":"Optimal Control of the Navier-Stokes equations via Pressure Boundary\n Conditions","summary":" In this work we study an optimal control problem subject to the instationary\nNavier-Stokes equations, where the control enters via an inhomogeneous\nNeumann/Do-Nothing boundary condition. Despite the Navier-Stokes equations with\nthese boundary conditions not being well-posed for large times and/or data, we\nobtain wellposedness of the optimal control problem by choosing a proper\ntracking type term. In order to discuss the regularity of the optimal control,\nstate and adjoint state, we present new results on $L^2(I;H^2(\\Omega))$\nregularity of solutions to a Stokes problem with mixed inhomogeneous boundary\nconditions.\n","authors":["Boris Vexler","Jakob Wagner"],"pdf_url":"https://arxiv.org/pdf/2501.04548v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04536v1","updated":"2025-01-08T14:36:22Z","published":"2025-01-08T14:36:22Z","title":"Scalable Derivative-Free Optimization Algorithms with Low-Dimensional\n Subspace Techniques","summary":" We re-introduce a derivative-free subspace optimization framework originating\nfrom Chapter 5 of the Ph.D. thesis [Z. Zhang, On Derivative-Free Optimization\nMethods, Ph.D. thesis, Chinese Academy of Sciences, Beijing, 2012] of the\nauthor under the supervision of Ya-xiang Yuan. At each iteration, the framework\ndefines a (low-dimensional) subspace based on an approximate gradient, and then\nsolves a subproblem in this subspace to generate a new iterate. 
We sketch the\nglobal convergence and worst-case complexity analysis of the framework,\nelaborate on its implementation, and present some numerical results on solving\nproblems with dimensions as high as 10^4 using only inaccurate function values.\n","authors":["Zaikun Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.04536v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04491v1","updated":"2025-01-08T13:23:50Z","published":"2025-01-08T13:23:50Z","title":"A fast iterative thresholding and support-and-scale shrinking algorithm\n (fits3) for non-lipschitz group sparse optimization (i): the case of\n least-squares fidelity","summary":" We consider to design a new efficient and easy-to-implement algorithm to\nsolve a general group sparse optimization model with a class of non-convex\nnon-Lipschitz regularizations, named as fast iterative thresholding and\nsupport-and-scale shrinking algorithm (FITS3). In this paper we focus on the\ncase of a least-squares fidelity. FITS3 is designed from a lower bound theory\nof such models and by integrating thresholding operation, linearization and\nextrapolation techniques. The FITS3 has two advantages. Firstly, it is quite\nefficient and especially suitable for large-scale problems, because it adopts\nsupport-and-scale shrinking and does not need to solve any linear or nonlinear\nsystem. For two important special cases, the FITS3 contains only simple\ncalculations like matrix-vector multiplication and soft thresholding. Secondly,\nthe FITS3 algorithm has a sequence convergence guarantee under proper\nassumptions. 
The numerical experiments and comparisons to recent existing\nnon-Lipschitz group recovery algorithms demonstrate that, the proposed FITS3\nachieves similar recovery accuracies, but costs only around a half of the CPU\ntime by the second fastest compared algorithm for median or large-scale\nproblems.\n","authors":["Yanan Zhao","Qiaoli Dong","Yufei Zhao","Chunlin Wu"],"pdf_url":"https://arxiv.org/pdf/2501.04491v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03744v2","updated":"2025-01-08T13:12:34Z","published":"2025-01-07T12:38:21Z","title":"Hydrogen Network Expansion Planning considering the Chicken-and-egg\n Dilemma and Market Uncertainty","summary":" Green hydrogen is thought to be a game changer for reaching sustainability\ntargets. However, the transition to a green hydrogen economy faces a critical\nchallenge known as the `chicken-and-egg dilemma', wherein establishing a\nhydrogen supply network relies on demand, while demand only grows with reliable\nsupply. In addition, as the hydrogen market is in the early stage, predicting\ndemand distributions is challenging due to lack of data availability. This\npaper addresses these complex issues through a risk-averse framework with the\nintroduction of a distributionally robust hydrogen network expansion planning\nproblem under decision-dependent demand ambiguity. The problem optimizes\nlocation and production capacity decisions of the suppliers considering the\nmoments of the stochastic hydrogen demand as a function of these investment\ndecisions. To obtain tractable representations of this problem, we derive two\ndifferent reformulations that consider continuous and discrete hydrogen demand\nsupport sets under different forms of decision dependencies. 
To efficiently\nsolve the reformulations, we develop a tailored algorithm based on the\ncolumn-and-constraint generation approach, and enhance the computational\nperformance through solving the master problems to a relative optimality gap,\ndecomposing the subproblems, and integrating pre-generated columns and\nconstraints. To validate the effectiveness of our approach, we investigate a\nreal case study leveraging data from the \"Hydrogen Energy Applications in\nValley Environments for Northern Netherlands (HEAVENN)\" project. The results\nreveal that considering the chicken-and-egg dilemma under uncertain hydrogen\nmarket conditions leads to earlier and more diverse investments, providing\ncritical insights for policymakers based on the degree of decision dependency.\n","authors":["Sezen Ece Kayacık","Beste Basciftci","Albert H. Schrotenboer","Iris F. A. Vis","Evrim Ursavas"],"pdf_url":"https://arxiv.org/pdf/2501.03744v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03390v2","updated":"2025-01-08T12:20:18Z","published":"2025-01-06T21:15:03Z","title":"State-of-the-art Methods for Pseudo-Boolean Solving with SCIP","summary":" The Pseudo-Boolean problem deals with linear or polynomial constraints with\ninteger coefficients over Boolean variables. The objective lies in optimizing a\nlinear objective function, or finding a feasible solution, or finding a\nsolution that satisfies as many constraints as possible. In the 2024\nPseudo-Boolean competition, solvers incorporating the SCIP framework won five\nout of six categories it was competing in. From a total of 1,207 instances,\nSCIP successfully solved 759, while its parallel version FiberSCIP solved 776.\nBased on the results from the competition, we further enhanced SCIP's\nPseudo-Boolean capabilities. 
This article discusses the results and presents\nthe winning algorithmic ideas.\n","authors":["Gioni Mexi","Dominik Kamp","Yuji Shinano","Shanwen Pu","Alexander Hoen","Ksenia Bestuzheva","Christopher Hojny","Matthias Walter","Marc E. Pfetsch","Sebastian Pokutta","Thorsten Koch"],"pdf_url":"https://arxiv.org/pdf/2501.03390v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04443v1","updated":"2025-01-08T11:52:43Z","published":"2025-01-08T11:52:43Z","title":"Revisiting LocalSGD and SCAFFOLD: Improved Rates and Missing Analysis","summary":" LocalSGD and SCAFFOLD are widely used methods in distributed stochastic\noptimization, with numerous applications in machine learning, large-scale data\nprocessing, and federated learning. However, rigorously establishing their\ntheoretical advantages over simpler methods, such as minibatch SGD (MbSGD), has\nproven challenging, as existing analyses often rely on strong assumptions,\nunrealistic premises, or overly restrictive scenarios.\n In this work, we revisit the convergence properties of LocalSGD and SCAFFOLD\nunder a variety of existing or weaker conditions, including gradient\nsimilarity, Hessian similarity, weak convexity, and Lipschitz continuity of the\nHessian. Our analysis shows that (i) LocalSGD achieves faster convergence\ncompared to MbSGD for weakly convex functions without requiring stronger\ngradient similarity assumptions; (ii) LocalSGD benefits significantly from\nhigher-order similarity and smoothness; and (iii) SCAFFOLD demonstrates faster\nconvergence than MbSGD for a broader class of non-quadratic functions. 
These\ntheoretical insights provide a clearer understanding of the conditions under\nwhich LocalSGD and SCAFFOLD outperform MbSGD.\n","authors":["Ruichen Luo","Sebastian U Stich","Samuel Horváth","Martin Takáč"],"pdf_url":"https://arxiv.org/pdf/2501.04443v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03718v2","updated":"2025-01-08T11:52:40Z","published":"2025-01-07T11:58:10Z","title":"Scalable Second-Order Optimization Algorithms for Minimizing Low-rank\n Functions","summary":" We present a random-subspace variant of cubic regularization algorithm that\nchooses the size of the subspace adaptively, based on the rank of the projected\nsecond derivative matrix. Iteratively, our variant only requires access to\n(small-dimensional) projections of first- and second-order problem derivatives\nand calculates a reduced step inexpensively. The ensuing method maintains the\noptimal global rate of convergence of (full-dimensional) cubic regularization,\nwhile showing improved scalability both theoretically and numerically,\nparticularly when applied to low-rank functions. When applied to the latter,\nour algorithm naturally adapts the subspace size to the true rank of the\nfunction, without knowing it a priori.\n","authors":["Edward Tansley","Coralia Cartis"],"pdf_url":"https://arxiv.org/pdf/2501.03718v2.pdf","comment":"Accepted at NeurIPS 2024 Workshop OPT2024: Optimization for Machine\n Learning; fixed typo on page 5"},{"id":"http://arxiv.org/abs/2309.05596v3","updated":"2025-01-08T10:33:39Z","published":"2023-09-11T16:24:05Z","title":"Output-Positive Adaptive Control of Hyperbolic PDE-ODE Cascades","summary":" In this paper, we propose a new adaptive Control Barrier Function (aCBF)\nmethod to design the output-positive adaptive control law for a hyperbolic\nPDE-ODE cascade with parametric uncertainties. 
This method employs the recent\nadaptive control approach with batch least-squares identification (BaLSI,\npronounced \"ballsy\") that completes perfect parameter identification in finite\ntime and offers a previously unforeseen advantage in safe control design with\naCBF, which we elucidate in this paper. Since the true challenge is exhibited\nfor CBF of a high relative degree, we undertake a control design in this paper\nfor a class of systems that possess a particularly extreme relative degree:\n$2\\times2$ hyperbolic PDEs sandwiched by a strict-feedback nonlinear ODE and a\nlinear ODE, where the unknown coefficients are associated with the PDE\nin-domain coupling terms and with the input signal of the distal ODE. The\ndesigned output-positive adaptive controller guarantees the positivity of the\noutput signal that is the furthermost state from the control input as well as\nthe exponential regulation of the overall plant state to zero. The\neffectiveness of the proposed method is illustrated by numerical simulation.\n","authors":["Ji Wang","Miroslav Krstic"],"pdf_url":"https://arxiv.org/pdf/2309.05596v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00810v2","updated":"2025-01-08T10:23:01Z","published":"2024-03-31T21:51:28Z","title":"Off-the-grid regularisation for Poisson inverse problems","summary":" Off-the-grid regularisation has been extensively employed over the last\ndecade in the context of ill-posed inverse problems formulated in the\ncontinuous setting of the space of Radon measures $\\mathcal{M}(\\mathcal{X})$.\nThese approaches enjoy convexity and counteract the discretisation biases as\nwell the numerical instabilities typical of their discrete counterparts. 
In the\nframework of sparse reconstruction of discrete point measures (sum of weighted\nDiracs), a Total Variation regularisation norm in $\mathcal{M}(\mathcal{X})$ is\ntypically combined with an $L^2$ data term modelling additive Gaussian noise.\nTo assess the framework of off-the-grid regularisation in the presence of\nsignal-dependent Poisson noise, we consider in this work a variational model\ncoupling the Total Variation regularisation with a Kullback-Leibler data term\nunder a non-negativity constraint. Analytically, we study the optimality\nconditions of the composite functional and analyse its dual problem. Then, we\nconsider a homotopy strategy to select an optimal regularisation parameter and\nuse it within a Sliding Frank-Wolfe algorithm. Several numerical experiments on\nboth 1D/2D simulated and real 3D fluorescent microscopy data are reported.\n","authors":["Marta Lazzaretti","Claudio Estatico","Alejandro Melero","Luca Calatroni"],"pdf_url":"https://arxiv.org/pdf/2404.00810v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.09973v4","updated":"2025-01-08T09:50:24Z","published":"2024-05-16T10:37:40Z","title":"Ensemble Control for Stochastic Systems with Asymmetric Laplace Noises","summary":" This paper presents an adaptive ensemble control for stochastic systems\nsubject to asymmetric noises and outliers. Asymmetric noises skew system\nobservations, and outliers with large amplitude deteriorate the observations\neven further. Such disturbances induce poor system estimation and degraded\nstochastic system control. In this work, we model the asymmetric noises and\noutliers by mixed asymmetric Laplace distributions (ALDs), and propose an\noptimal control for stochastic systems with mixed ALD noises. Particularly, we\nsegregate the system disturbed by mixed ALD noises into subsystems, each of\nwhich is subject to a specific ALD noise. 
For each subsystem, we design an\niterative quantile filter (IQF) to estimate the system parameters using system\nobservations. With the estimated parameters by IQF, we derive the certainty\nequivalence (CE) control law for each subsystem. Then we use the Bayesian\napproach to ensemble the subsystem CE controllers, with each of the controllers\nweighted by their posterior probability. We finalize our control law as the\nweighted sum of the control signals by the sub-system CE controllers. To\ndemonstrate our approach, we conduct numerical simulations and Monte Carlo\nanalyses. The results show improved tracking performance by our approach for\nskew noises and its robustness to outliers, compared with single ALD based and\nRLS-based control policy.\n","authors":["Yajie Yu","Xuehui Ma","Shiliang Zhang","Zhuzhu Wang","Xubing Shi","Yushuai Li","Tingwen Huang"],"pdf_url":"https://arxiv.org/pdf/2405.09973v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04369v1","updated":"2025-01-08T09:16:03Z","published":"2025-01-08T09:16:03Z","title":"State-dependent preconditioning for the inner-loop in Variational Data\n Assimilation using Machine Learning","summary":" Data Assimilation is the process in which we improve the representation of\nthe state of a physical system by combining information coming from a numerical\nmodel, real-world observations, and some prior modelling. It is widely used to\nmodel and to improve forecast systems in Earth science fields such as\nmeteorology, oceanography and environmental sciences. One key aspect of Data\nassimilation is the analysis step, where the output of the numerical model is\nadjusted in order to account for the observational data. In Variational Data\nAssimilation and under Gaussian assumptions, the analysis step comes down to\nsolving a high-dimensional non-linear least-square problem. 
In practice, this\nminimization involves successive inversions of large, and possibly\nill-conditioned matrices constructed using linearizations of the forward model.\nIn order to improve the convergence rate of these methods, and thus reduce the\ncomputational burden, preconditioning techniques are often used to get\nbetter-conditioned matrices, but require either the sparsity pattern of the\nmatrix to inverse, or some spectral information. We propose to use Deep Neural\nNetworks in order to construct a preconditioner. This surrogate is trained\nusing some properties of the singular value decomposition, and is based on a\ndataset which can be constructed online to reduce the storage requirements.\n","authors":["Victor Trappler","Arthur Vidard"],"pdf_url":"https://arxiv.org/pdf/2501.04369v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04335v1","updated":"2025-01-08T08:12:23Z","published":"2025-01-08T08:12:23Z","title":"An algorithm for a constrained P-spline","summary":" Regression splines are largely used to investigate and predict data behavior,\nattracting the interest of mathematicians for their beautiful numerical\nproperties, and of statisticians for their versatility with respect to the\napplications. Several penalized spline regression models are available in the\nliterature, and the most commonly used ones in real-world applications are\nP-splines, which enjoy the advantages of penalized models while being easy to\ngeneralize across different functional spaces and higher degree order, because\nof their discrete penalty term. To face the different requirements imposed by\nthe nature of the problem or the physical meaning of the expected values, the\nP-spline definition is often modified by additional hypotheses, often\ntranslated into constraints on the solution or its derivatives. In this\nframework, our work is motivated by the aim of getting approximation models\nthat fall within pre-established thresholds. 
Specifically, starting from a set\nof observed data, we consider a P-spline constrained between some prefixed\nbounds. In our paper, we just consider 0 as lower bound, although our approach\napplies to more general cases. We propose to get nonnegativity by imposing\nlower bounds on selected sample points. The spline can be computed through a\nsequence of linearly constrained problems. We suggest a strategy to dynamically\nselect the sample points, to avoid extremely dense sampling, and therefore try\nto reduce as much as possible the computational burden. We show through some\ncomputational experiments the reliability of our approach and the accuracy of\nthe results compared to some state-of-the-art models.\n","authors":["Rosanna Campagna","Serena Crisci","Gabriele Santin","Gerardo Toraldo","Marco Viola"],"pdf_url":"https://arxiv.org/pdf/2501.04335v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01714v4","updated":"2025-01-08T06:52:07Z","published":"2024-04-02T07:57:17Z","title":"Conjugate-Gradient-like Based Adaptive Moment Estimation Optimization\n Algorithm for Deep Learning","summary":" Training deep neural networks is a challenging task. In order to speed up\ntraining and enhance the performance of deep neural networks, we rectify the\nvanilla conjugate gradient as conjugate-gradient-like and incorporate it into\nthe generic Adam, and thus propose a new optimization algorithm named\nCG-like-Adam for deep learning. Specifically, both the first-order and the\nsecond-order moment estimation of generic Adam are replaced by the\nconjugate-gradient-like. Convergence analysis handles the cases where the\nexponential moving average coefficient of the first-order moment estimation is\nconstant and the first-order moment estimation is unbiased. 
Numerical\nexperiments show the superiority of the proposed algorithm based on the\nCIFAR10/100 dataset.\n","authors":["Jiawu Tian","Liwei Xu","Xiaowei Zhang","Yongqi Li"],"pdf_url":"https://arxiv.org/pdf/2404.01714v4.pdf","comment":"32 pages, 13 figures"},{"id":"http://arxiv.org/abs/2501.04291v1","updated":"2025-01-08T05:31:58Z","published":"2025-01-08T05:31:58Z","title":"A truncated ε-subdifferential method for global DC optimization","summary":" We consider the difference of convex (DC) optimization problem subject to\nbox-constraints. Utilizing {\\epsilon}-subdifferentials of DC components of the\nobjective, we develop a new method for finding global solutions to this\nproblem. The method combines a local search approach with a special procedure\nfor escaping non-global solutions by identifying improved initial points for a\nlocal search. The method terminates when the solution cannot be improved\nfurther. The escaping procedure is designed using subsets of the\n{\\epsilon}-subdifferentials of DC components. We compute the deviation between\nthese subsets and determine {\\epsilon}-subgradients providing this deviation.\nUsing these specific {\\epsilon}-subgradients, we formulate a subproblem with a\nconvex objective function. The solution to this subproblem serves as a starting\npoint for a local search. We study the convergence of the conceptual version of\nthe proposed method and discuss its implementation. A large number of academic\ntest problems demonstrate that the method requires reasonable computational\neffort to find higher quality solutions than other local DC optimization\nmethods. Additionally, we apply the new method to find global solutions to DC\noptimization problems and compare its performance with two benchmark global\noptimization solvers.\n","authors":["Adil M. Bagirov","Kaisa Joki","Marko M. 
Makela","Sona Taheri"],"pdf_url":"https://arxiv.org/pdf/2501.04291v1.pdf","comment":"35 pages, 9 figures"},{"id":"http://arxiv.org/abs/2211.11955v2","updated":"2025-01-08T04:21:33Z","published":"2022-11-22T02:28:17Z","title":"Optimal Stabilization of Periodic Orbits","summary":" In this contribution, the optimal stabilization problem of periodic orbits is\nstudied via invariant manifold theory and symplectic geometry. The stable\nmanifold theory for the optimal point stabilization case is generalized to the\ncase of periodic orbit stabilization, where a normally hyperbolic invariant\nmanifold (NHIM) plays the role of a hyperbolic equilibrium.\n A sufficient condition for the existence of an NHIM of an associated\nHamiltonian system is derived in terms of a periodic Riccati differential\nequation. It is shown that the problem of optimal orbit stabilization has a\nsolution if a linearized periodic system satisfies stabilizability and\ndetectability. A moving orthogonal coordinate system is employed along the\nperiodic orbit which is a natural framework for orbital stabilization and\nlinearization argument.\n Examples illustrated include an optimal control problem for a spring-mass\noscillator system, which should be stabilized at a certain energy level, and an\norbit transfer problem for a satellite, which constitutes a typical control\nproblem of orbital mechanics.\n","authors":["Fabian Beck","Noboru Sakamoto"],"pdf_url":"https://arxiv.org/pdf/2211.11955v2.pdf","comment":"Submitted for a journal on November 29 2024"},{"id":"http://arxiv.org/abs/2501.04253v1","updated":"2025-01-08T03:35:28Z","published":"2025-01-08T03:35:28Z","title":"Integrated Offline and Online Learning to Solve a Large Class of\n Scheduling Problems","summary":" In this paper, we develop a unified machine learning (ML) approach to predict\nhigh-quality solutions for single-machine scheduling problems with a\nnon-decreasing min-sum objective function with or without release times. 
Our ML\napproach is novel in three major aspects. First, our approach is developed for\nthe entire class of the aforementioned problems. To achieve this, we exploit\nthe fact that the entire class of the problems considered can be formulated as\na time-indexed formulation in a unified manner. We develop a deep neural\nnetwork (DNN) which uses the cost parameters in the time-indexed formulation as\nthe inputs to effectively predict a continuous solution to this formulation,\nbased on which a feasible discrete solution is easily constructed. The second\nnovel aspect of our approach lies in how the DNN model is trained. In view of\nthe NP-hard nature of the problems, labels (i.e., optimal solutions) are hard\nto generate for training. To overcome this difficulty, we generate and utilize\na set of special instances, for which optimal solutions can be found with\nlittle computational effort, to train the ML model offline. The third novel\nidea we employ in our approach is that we develop an online single-instance\nlearning approach to fine tune the parameters in the DNN for a given online\ninstance, with the goal of generating an improved solution for the given\ninstance. To this end, we develop a feasibility surrogate that approximates the\nobjective value of a given instance as a continuous function of the outputs of\nthe DNN, which then enables us to derive gradients and update the learnable\nparameters in the DNN. 
Numerical results show that our approach can efficiently\ngenerate high-quality solutions for a variety of single-machine scheduling\nmin-sum problems with up to 1000 jobs.\n","authors":["Anbang Liu","Zhi-Long Chen","Jinyang Jiang","Xi Chen"],"pdf_url":"https://arxiv.org/pdf/2501.04253v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.08090v4","updated":"2025-01-08T03:08:11Z","published":"2024-02-12T22:17:28Z","title":"Learning Neural Contracting Dynamics: Extended Linearization and Global\n Guarantees","summary":" Global stability and robustness guarantees in learned dynamical systems are\nessential to ensure well-behavedness of the systems in the face of uncertainty.\nWe present Extended Linearized Contracting Dynamics (ELCD), the first neural\nnetwork-based dynamical system with global contractivity guarantees in\narbitrary metrics. The key feature of ELCD is a parametrization of the extended\nlinearization of the nonlinear vector field. In its most basic form, ELCD is\nguaranteed to be (i) globally exponentially stable, (ii) equilibrium\ncontracting, and (iii) globally contracting with respect to some metric. To\nallow for contraction with respect to more general metrics in the data space,\nwe train diffeomorphisms between the data space and a latent space and enforce\ncontractivity in the latent space, which ensures global contractivity in the\ndata space. We demonstrate the performance of ELCD on the high dimensional\nLASA, multi-link pendulum, and Rosenbrock datasets.\n","authors":["Sean Jaffe","Alexander Davydov","Deniz Lapsekili","Ambuj Singh","Francesco Bullo"],"pdf_url":"https://arxiv.org/pdf/2402.08090v4.pdf","comment":"9 pages, 3 figures. 
NeurIPS 2024"},{"id":"http://arxiv.org/abs/2501.04225v1","updated":"2025-01-08T01:53:26Z","published":"2025-01-08T01:53:26Z","title":"A black-box optimization method with polynomial-based kernels and\n quadratic-optimization annealing","summary":" We introduce kernel-QA, a black-box optimization (BBO) method that constructs\nsurrogate models analytically using low-order polynomial kernels within a\nquadratic unconstrained binary optimization (QUBO) framework, enabling\nefficient utilization of Ising machines. The method has been evaluated on\nartificial landscapes, ranging from uni-modal to multi-modal, with input\ndimensions extending to 80 for real variables and 640 for binary variables. The\nresults demonstrate that kernel-QA is particularly effective for optimizing\nblack-box functions characterized by local minima and high-dimensional inputs,\nshowcasing its potential as a robust and scalable BBO approach.\n","authors":["Yuki Minamoto","Yuya Sakamoto"],"pdf_url":"https://arxiv.org/pdf/2501.04225v1.pdf","comment":"32 pages, 11 figures, and 1 table"},{"id":"http://arxiv.org/abs/2404.16731v2","updated":"2025-01-08T01:41:29Z","published":"2024-04-25T16:41:57Z","title":"Non-asymptotic Global Convergence Analysis of BFGS with the Armijo-Wolfe\n Line Search","summary":" In this paper, we present the first explicit and non-asymptotic global\nconvergence rates of the BFGS method when implemented with an inexact line\nsearch scheme satisfying the Armijo-Wolfe conditions. We show that BFGS\nachieves a global linear convergence rate of $(1 - \\frac{1}{\\kappa})^t$ for\n$\\mu$-strongly convex functions with $L$-Lipschitz gradients, where $\\kappa =\n\\frac{L}{\\mu}$ represents the condition number. Additionally, if the objective\nfunction's Hessian is Lipschitz, BFGS with the Armijo-Wolfe line search\nachieves a linear convergence rate that depends solely on the line search\nparameters, independent of the condition number. 
We also establish a global\nsuperlinear convergence rate of $\\mathcal{O}((\\frac{1}{t})^t)$. These global\nbounds are all valid for any starting point $x_0$ and any symmetric positive\ndefinite initial Hessian approximation matrix $B_0$, though the choice of $B_0$\nimpacts the number of iterations needed to achieve these rates. By synthesizing\nthese results, we outline the first global complexity characterization of BFGS\nwith the Armijo-Wolfe line search. Additionally, we clearly define a mechanism\nfor selecting the step size to satisfy the Armijo-Wolfe conditions and\ncharacterize its overall complexity.\n","authors":["Qiujiang Jin","Ruichen Jiang","Aryan Mokhtari"],"pdf_url":"https://arxiv.org/pdf/2404.16731v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.01615v2","updated":"2025-01-08T20:48:41Z","published":"2025-01-03T03:19:02Z","title":"Equity Impacts of Public Transit Network Redesign with Shared Autonomous\n Mobility Services","summary":" This study examines the equity impacts of integrating shared autonomous\nmobility services (SAMS) into transit system redesign. Using the Greater\nChicago area as a case study, we compare two optimization objectives in\nmultimodal transit network redesign: minimizing total generalized costs\n(equity-agnostic) versus prioritizing service in low-income areas\n(equity-focused). We evaluate the achieved accessibility of clustered zones\nwith redesigned transit networks under two objectives, compared to driving and\nthe existing transit network. The transit access gaps across zones and between\ntransit and driving are found to be generally reduced with the introduction of\nSAMS, but less so with the subsequent improved infrastructure under budget.\nDifferential improvement in equity is seen across suburbs and areas of the\ncity, reflecting the disparity in current transit access and improvement\npotential. 
In particular, SAMS bridges the transit access gaps in suburban and\ncity areas currently underserved by transit. The City of Chicago, which is also\ndisproportionately home to vulnerable populations, offers an avenue to improve\nvertical equity. These findings demonstrate that SAMS can enhance both\nhorizontal and vertical equity in transit systems, particularly when equity is\nexplicitly incorporated into the design objective.\n","authors":["Max T. M. Ng","Meredith Raymer","Hani S. Mahmassani","Omer Verbas","Taner Cokyasar"],"pdf_url":"https://arxiv.org/pdf/2501.01615v2.pdf","comment":"Restructuring the paper for more precise research direction"},{"id":"http://arxiv.org/abs/2501.04833v1","updated":"2025-01-08T20:38:15Z","published":"2025-01-08T20:38:15Z","title":"Multi-step Inertial Accelerated Doubly Stochastic Gradient Methods for\n Block Term Tensor Decomposition","summary":" In this paper, we explore a specific optimization problem that combines a\ndifferentiable nonconvex function with a nondifferentiable function for\nmulti-block variables, which is particularly relevant to tackle the multilinear\nrank-($L_r$,$L_r$,1) block-term tensor decomposition model with a\nregularization term. While existing algorithms often suffer from high\nper-iteration complexity and slow convergence, this paper employs a unified\nmulti-step inertial accelerated doubly stochastic gradient descent method\ntailored for structured rank-$\\left(L_r, L_r, 1\\right)$ tensor decomposition,\nreferred to as Midas-LL1. We also introduce an extended multi-step\nvariance-reduced stochastic estimator framework. Our analysis under this new\nframework demonstrates the subsequential and sequential convergence of the\nproposed algorithm under certain conditions and illustrates the sublinear\nconvergence rate of the subsequence, showing that the Midas-LL1 algorithm\nrequires at most $\\mathcal{O}(\\varepsilon^{-2})$ iterations in expectation to\nreach an $\\varepsilon$-stationary point. 
The proposed algorithm is evaluated on\nseveral datasets, and the results indicate that Midas-LL1 outperforms existing\nstate-of-the-art algorithms in terms of both computational speed and solution\nquality.\n","authors":["Zehui Liu","Qingsong Wang","Chunfeng Cui"],"pdf_url":"https://arxiv.org/pdf/2501.04833v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04823v1","updated":"2025-01-08T20:22:16Z","published":"2025-01-08T20:22:16Z","title":"Learning Robot Safety from Sparse Human Feedback using Conformal\n Prediction","summary":" Ensuring robot safety can be challenging; user-defined constraints can miss\nedge cases, policies can become unsafe even when trained from safe data, and\nsafety can be subjective. Thus, we learn about robot safety by showing policy\ntrajectories to a human who flags unsafe behavior. From this binary feedback,\nwe use the statistical method of conformal prediction to identify a region of\nstates, potentially in learned latent space, guaranteed to contain a\nuser-specified fraction of future policy errors. Our method is\nsample-efficient, as it builds on nearest neighbor classification and avoids\nwithholding data as is common with conformal prediction. By alerting if the\nrobot reaches the suspected unsafe region, we obtain a warning system that\nmimics the human's safety preferences with guaranteed miss rate. From video\nlabeling, our system can detect when a quadcopter visuomotor policy will fail\nto steer through a designated gate. We present an approach for policy\nimprovement by avoiding the suspected unsafe region. With it we improve a model\npredictive controller's safety, as shown in experimental testing with 30\nquadcopter flights across 6 navigation tasks. Code and videos are provided.\n","authors":["Aaron O. Feldman","Joseph A. 
Vincent","Maximilian Adang","Jun En Low","Mac Schwager"],"pdf_url":"https://arxiv.org/pdf/2501.04823v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04805v1","updated":"2025-01-08T19:46:41Z","published":"2025-01-08T19:46:41Z","title":"Extended formulations for the multilinear polytope of acyclic\n hypergraphs","summary":" This article provides an overview of our joint work on binary polynomial\noptimization over the past decade. We define the multilinear polytope as the\nconvex hull of the feasible region of a linearized binary polynomial\noptimization problem. By representing the multilinear polytope with\nhypergraphs, we investigate the connections between hypergraph acyclicity and\nthe complexity of the facial structure of the multilinear polytope. We\ncharacterize the acyclic hypergraphs for which a polynomial-size extended\nformulation for the multilinear polytope can be constructed in polynomial time.\n","authors":["Alberto Del Pia","Aida Khajavirad"],"pdf_url":"https://arxiv.org/pdf/2501.04805v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2212.11239"},{"id":"http://arxiv.org/abs/2501.04781v1","updated":"2025-01-08T19:00:21Z","published":"2025-01-08T19:00:21Z","title":"Inexact Catching-Up Algorithm for Moreau's Sweeping Processes","summary":" In this paper, we develop an inexact version of the catching-up algorithm for\nsweeping processes. We define a new notion of approximate projection, which is\ncompatible with any numerical method for approximating exact projections, as\nthis new notion is not restricted to remain strictly within the set. We provide\nseveral properties of the new approximate projections, which enable us to prove\nthe convergence of the inexact catching-up algorithm in three general\nframeworks: prox-regular moving sets, subsmooth moving sets, and merely closed\nsets. Additionally, we apply our numerical results to address complementarity\ndynamical systems, particularly electrical circuits with ideal diodes. 
In this\ncontext, we implement the inexact catching-up algorithm using a primal-dual\noptimization method, which does not necessarily guarantee a feasible\npoint. Our results are illustrated through an electrical circuit with ideal\ndiodes. Our results recover classical existence results in the literature and\nprovide new insights into the numerical simulation of sweeping processes.\n","authors":["Juan Guillermo Garrido","Maximiliano Lioi","Emilio Vilches"],"pdf_url":"https://arxiv.org/pdf/2501.04781v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2308.08093"},{"id":"http://arxiv.org/abs/2501.04759v1","updated":"2025-01-08T17:28:30Z","published":"2025-01-08T17:28:30Z","title":"Optimize the parameters of the PID Controller using Genetic Algorithm\n for Robot Manipulators","summary":" This paper presents the design of a Proportional-Integral-Derivative (PID)\ncontroller with optimized parameters for a two-degree-of-freedom robotic arm. A\ngenetic algorithm (GA) is proposed to optimize the controller parameters,\naddressing the challenges in determining PID controller parameters for highly\nnonlinear systems like robotic arms compared to traditional methods. The\nGA-optimized PID controller significantly improves control accuracy and\nperformance over traditional control methods. Simulation results demonstrate\nthat the robotic arm system operates with high precision and stability.\nAdditionally, the shortened trajectory tracking response time enhances the\nfeasibility of applying this control algorithm in real-world scenarios. 
This\nresearch not only confirms the suitability of PID-GA for robotic arms and\nsimilar systems but also opens new avenues for applying this algorithm to real\nphysical systems.\n","authors":["Vu Ngoc Son","Pham Van Cuong","Nguyen Duy Minh","Phi Hoang Nha"],"pdf_url":"https://arxiv.org/pdf/2501.04759v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06251v1","updated":"2025-01-08T23:53:39Z","published":"2025-01-08T23:53:39Z","title":"Under the hood of a carbon footprint calculator","summary":" We explain the mathematical theory of the Input-Output method for carbon\nfootprint computations.\n","authors":["Indira Chatterji","Ariadna Fossas Tenas","Elise Raphael"],"pdf_url":"https://arxiv.org/pdf/2501.06251v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06247v1","updated":"2025-01-08T18:06:30Z","published":"2025-01-08T18:06:30Z","title":"A Survey on Algorithmic Developments in Optimal Transport Problem with\n Applications","summary":" Optimal Transport (OT) has established itself as a robust framework for\nquantifying differences between distributions, with applications that span\nfields such as machine learning, data science, and computer vision. This paper\noffers a detailed examination of the OT problem, beginning with its theoretical\nfoundations, including the classical formulations of Monge and Kantorovich and\ntheir extensions to modern computational techniques. It explores cutting-edge\nalgorithms, including Sinkhorn iterations, primal-dual strategies, and\nreduction-based approaches, emphasizing their efficiency and scalability in\naddressing high-dimensional problems.
The paper also highlights emerging\ntrends, such as integrating OT into machine learning frameworks, the\ndevelopment of novel problem variants, and ongoing theoretical advancements.\nApplications of OT are presented across a range of domains, with particular\nattention to its innovative application in time series data analysis via\nOptimal Transport Warping (OTW), a robust alternative to methods like Dynamic\nTime Warping. Despite the significant progress made, challenges related to\nscalability, robustness, and ethical considerations remain, necessitating\nfurther research. The paper underscores OT's potential to bridge theoretical\ndepth and practical utility, fostering impactful advancements across diverse\ndisciplines.\n","authors":["Sina Moradi"],"pdf_url":"https://arxiv.org/pdf/2501.06247v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06240v1","updated":"2025-01-08T13:26:56Z","published":"2025-01-08T13:26:56Z","title":"The Convergence of Dynamic Routing between Capsules","summary":" Capsule networks (CapsNet) are recently proposed neural network models with\nnew processing layers, specifically for entity representation and discovery in\nimages. It is well known that CapsNets have some advantages over traditional\nneural networks, especially in generalization capability. At the same time,\nsome studies report negative experimental results. The causes of this\ncontradiction have not been thoroughly analyzed. The preliminary experimental\nresults show that the behavior of routing algorithms does not always produce\ngood results as expected, and in most cases, different routing algorithms do\nnot change the classification results, but simply polarize the link strength,\nespecially when they continue to repeat without stopping. To realize the true\npotential of the CapsNet, deep mathematical analysis of the routing algorithms\nis crucial.
In this paper, we give the objective function that is\nminimized by the dynamic routing algorithm, which is a concave function. The\ndynamic routing algorithm can be regarded as a nonlinear gradient method for\nsolving an optimization problem under linear constraints, and its convergence\ncan be strictly proved mathematically. Furthermore, a mathematically rigorous\nproof of convergence is given for this class of iterative routing\nprocedures. We analyze the relation between the objective function and the\nconstraints solved by the dynamic routing algorithm in detail, and perform the\ncorresponding routing experiment to analyze the effect of our convergence\nproof.\n","authors":["Daoyuan Ye","Juntao Li","Yiting Shen"],"pdf_url":"https://arxiv.org/pdf/2501.06240v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2409.08272v2","updated":"2025-01-08T18:59:48Z","published":"2024-09-12T17:59:04Z","title":"Click2Mask: Local Editing with Dynamic Mask Generation","summary":" Recent advancements in generative models have revolutionized image generation\nand editing, making these tasks accessible to non-experts. This paper focuses\non local image editing, particularly the task of adding new content to a\nloosely specified area. Existing methods often require a precise mask or a\ndetailed description of the location, which can be cumbersome and prone to\nerrors. We propose Click2Mask, a novel approach that simplifies the local\nediting process by requiring only a single point of reference (in addition to\nthe content description). A mask is dynamically grown around this point during\na Blended Latent Diffusion (BLD) process, guided by a masked CLIP-based\nsemantic loss. Click2Mask surpasses the limitations of segmentation-based and\nfine-tuning dependent methods, offering a more user-friendly and contextually\naccurate solution.
Our experiments demonstrate that Click2Mask not only\nminimizes user effort but also enables competitive or superior local image\nmanipulations compared to SoTA methods, according to both human judgement and\nautomatic metrics. Key contributions include the simplification of user input,\nthe ability to freely add objects unconstrained by existing segments, and the\nintegration potential of our dynamic mask approach within other editing\nmethods.\n","authors":["Omer Regev","Omri Avrahami","Dani Lischinski"],"pdf_url":"https://arxiv.org/pdf/2409.08272v2.pdf","comment":"Accepted to AAAI 2025. Project page is available at\n https://omeregev.github.io/click2mask/"},{"id":"http://arxiv.org/abs/2501.04700v1","updated":"2025-01-08T18:59:36Z","published":"2025-01-08T18:59:36Z","title":"Planarian Neural Networks: Evolutionary Patterns from Basic Bilateria\n Shaping Modern Artificial Neural Network Architectures","summary":" This study examined the viability of enhancing the prediction accuracy of\nartificial neural networks (ANNs) in image classification tasks by developing\nANNs with evolution patterns similar to those of biological neural networks.\nResNet is a widely used family of neural networks with both deep and wide\nvariants; therefore, it was selected as the base model for our investigation.\nThe aim of this study is to improve the image classification performance of\nANNs via a novel approach inspired by the biological nervous system\narchitecture of planarians, which comprises a brain and two nerve cords. We\nbelieve that the unique neural architecture of planarians offers valuable\ninsights into the performance enhancement of ANNs. The proposed planarian\nneural architecture-based neural network was evaluated on the CIFAR-10 and\nCIFAR-100 datasets. Our results indicate that the proposed method exhibits\nhigher prediction accuracy than the baseline neural network models in image\nclassification tasks. 
These findings demonstrate the significant potential of\nbiologically inspired neural network architectures in improving the performance\nof ANNs in a wide range of applications.\n","authors":["Ziyuan Huang","Mark Newman","Maria Vaida","Srikar Bellur","Roozbeh Sadeghian","Andrew Siu","Hui Wang","Kevin Huggins"],"pdf_url":"https://arxiv.org/pdf/2501.04700v1.pdf","comment":"11 pages, 9 figures"},{"id":"http://arxiv.org/abs/2501.04699v1","updated":"2025-01-08T18:59:35Z","published":"2025-01-08T18:59:35Z","title":"EditAR: Unified Conditional Generation with Autoregressive Models","summary":" Recent progress in controllable image generation and editing is largely\ndriven by diffusion-based methods. Although diffusion models perform\nexceptionally well in specific tasks with tailored designs, establishing a\nunified model is still challenging. In contrast, autoregressive models\ninherently feature a unified tokenized representation, which simplifies the\ncreation of a single foundational model for various tasks. In this work, we\npropose EditAR, a single unified autoregressive framework for a variety of\nconditional image generation tasks, e.g., image editing, depth-to-image,\nedge-to-image, segmentation-to-image. The model takes both images and\ninstructions as inputs, and predicts the edited image tokens in a vanilla\nnext-token paradigm. To enhance the text-to-image alignment, we further propose\nto distill the knowledge from foundation models into the autoregressive\nmodeling process. We evaluate its effectiveness across diverse tasks on\nestablished benchmarks, showing competitive performance to various\nstate-of-the-art task-specific methods.
Project page:\nhttps://jitengmu.github.io/EditAR/\n","authors":["Jiteng Mu","Nuno Vasconcelos","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2501.04699v1.pdf","comment":"Project page: https://jitengmu.github.io/EditAR/"},{"id":"http://arxiv.org/abs/2501.04698v1","updated":"2025-01-08T18:59:01Z","published":"2025-01-08T18:59:01Z","title":"ConceptMaster: Multi-Concept Video Customization on Diffusion\n Transformer Models Without Test-Time Tuning","summary":" Text-to-video generation has made remarkable advancements through diffusion\nmodels. However, Multi-Concept Video Customization (MCVC) remains a significant\nchallenge. We identify two key challenges in this task: 1) the identity\ndecoupling problem, where directly adopting existing customization methods\ninevitably mix attributes when handling multiple concepts simultaneously, and\n2) the scarcity of high-quality video-entity pairs, which is crucial for\ntraining such a model that represents and decouples various concepts well. To\naddress these challenges, we introduce ConceptMaster, an innovative framework\nthat effectively tackles the critical issues of identity decoupling while\nmaintaining concept fidelity in customized videos. Specifically, we introduce a\nnovel strategy of learning decoupled multi-concept embeddings that are injected\ninto the diffusion models in a standalone manner, which effectively guarantees\nthe quality of customized videos with multiple identities, even for highly\nsimilar visual concepts. To further overcome the scarcity of high-quality MCVC\ndata, we carefully establish a data construction pipeline, which enables\nsystematic collection of precise multi-concept video-entity data across diverse\nconcepts. A comprehensive benchmark is designed to validate the effectiveness\nof our model from three critical dimensions: concept fidelity, identity\ndecoupling ability, and video generation quality across six different concept\ncomposition scenarios. 
Extensive experiments demonstrate that our ConceptMaster\nsignificantly outperforms previous approaches for this task, paving the way for\ngenerating personalized and semantically accurate videos across multiple\nconcepts.\n","authors":["Yuzhou Huang","Ziyang Yuan","Quande Liu","Qiulin Wang","Xintao Wang","Ruimao Zhang","Pengfei Wan","Di Zhang","Kun Gai"],"pdf_url":"https://arxiv.org/pdf/2501.04698v1.pdf","comment":"Project Page: https://yuzhou914.github.io/ConceptMaster/"},{"id":"http://arxiv.org/abs/2501.04697v1","updated":"2025-01-08T18:58:48Z","published":"2025-01-08T18:58:48Z","title":"Grokking at the Edge of Numerical Stability","summary":" Grokking, the sudden generalization that occurs after prolonged overfitting,\nis a surprising phenomenon challenging our understanding of deep learning.\nAlthough significant progress has been made in understanding grokking, the\nreasons behind the delayed generalization and its dependence on regularization\nremain unclear. In this work, we argue that without regularization, grokking\ntasks push models to the edge of numerical stability, introducing floating\npoint errors in the Softmax function, which we refer to as Softmax Collapse\n(SC). We demonstrate that SC prevents grokking and that mitigating SC enables\ngrokking without regularization. Investigating the root cause of SC, we find\nthat beyond the point of overfitting, the gradients strongly align with what we\ncall the na\\\"ive loss minimization (NLM) direction. This component of the\ngradient does not alter the model's predictions but decreases the loss by\nscaling the logits, typically by scaling the weights along their current\ndirection. We show that this scaling of the logits explains the delay in\ngeneralization characteristic of grokking and eventually leads to SC, halting\nfurther learning. 
To validate our hypotheses, we introduce two key\ncontributions that address the challenges in grokking tasks: StableMax, a new\nactivation function that prevents SC and enables grokking without\nregularization, and $\\perp$Grad, a training algorithm that promotes quick\ngeneralization in grokking tasks by preventing NLM altogether. These\ncontributions provide new insights into grokking, elucidating its delayed\ngeneralization, reliance on regularization, and the effectiveness of existing\ngrokking-inducing methods. Code for this paper is available at\nhttps://github.com/LucasPrietoAl/grokking-at-the-edge-of-numerical-stability.\n","authors":["Lucas Prieto","Melih Barsbey","Pedro A. M. Mediano","Tolga Birdal"],"pdf_url":"https://arxiv.org/pdf/2501.04697v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04696v1","updated":"2025-01-08T18:58:24Z","published":"2025-01-08T18:58:24Z","title":"Test-Time Optimization for Domain Adaptive Open Vocabulary Segmentation","summary":" We present Seg-TTO, a novel framework for zero-shot, open-vocabulary semantic\nsegmentation (OVSS), designed to excel in specialized domain tasks. While\ncurrent open vocabulary approaches show impressive performance on standard\nsegmentation benchmarks under zero-shot settings, they fall short of supervised\ncounterparts on highly domain-specific datasets. We focus on\nsegmentation-specific test-time optimization to address this gap. Segmentation\nrequires an understanding of multiple concepts within a single image while\nretaining the locality and spatial structure of representations. We propose a\nnovel self-supervised objective adhering to these requirements and use it to\nalign the model parameters with input images at test time. 
In the textual\nmodality, we learn multiple embeddings for each category to capture diverse\nconcepts within an image, while in the visual modality, we calculate\npixel-level losses followed by embedding aggregation operations specific to\npreserving spatial structure. Our resulting framework, termed Seg-TTO, is a\nplug-and-play module. We integrate Seg-TTO with three state-of-the-art OVSS\napproaches and evaluate across 22 challenging OVSS tasks covering a range of\nspecialized domains. Our Seg-TTO demonstrates clear performance improvements\nacross these tasks, establishing a new state-of-the-art. Code:\nhttps://github.com/UlinduP/SegTTO.\n","authors":["Ulindu De Silva","Didula Samaraweera","Sasini Wanigathunga","Kavindu Kariyawasam","Kanchana Ranasinghe","Muzammal Naseer","Ranga Rodrigo"],"pdf_url":"https://arxiv.org/pdf/2501.04696v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04695v1","updated":"2025-01-08T18:58:22Z","published":"2025-01-08T18:58:22Z","title":"Re-ranking the Context for Multimodal Retrieval Augmented Generation","summary":" Retrieval-augmented generation (RAG) enhances large language models (LLMs) by\nincorporating external knowledge to generate a response within a context with\nimproved accuracy and reduced hallucinations. However, multi-modal RAG systems\nface unique challenges: (i) the retrieval process may select entries irrelevant\nto the user query (e.g., images, documents), and (ii) vision-language models or\nmulti-modal language models like GPT-4o may hallucinate when processing these\nentries to generate RAG output. In this paper, we aim to address the first\nchallenge, i.e., improving the selection of relevant context from the\nknowledge-base in the retrieval phase of the multi-modal RAG. Specifically, we\nleverage the relevancy score (RS) measure designed in our previous work for\nevaluating the RAG performance to select more relevant entries in the retrieval\nprocess.
Retrieval based on embeddings (e.g., CLIP-based embeddings) and\ncosine similarity usually performs poorly, particularly for multi-modal data. We\nshow that by using a more advanced relevancy measure, one can enhance the\nretrieval process by selecting more relevant pieces from the knowledge-base and\neliminate the irrelevant pieces from the context by adaptively selecting\nup to $k$ entries instead of a fixed number of entries. Our evaluation using the COCO\ndataset demonstrates significant enhancement in selecting relevant context and\nin the accuracy of the generated response.\n","authors":["Matin Mortaheb","Mohammad A. Amir Khojastepour","Srimat T. Chakradhar","Sennur Ulukus"],"pdf_url":"https://arxiv.org/pdf/2501.04695v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04689v1","updated":"2025-01-08T18:52:03Z","published":"2025-01-08T18:52:03Z","title":"SPAR3D: Stable Point-Aware Reconstruction of 3D Objects from Single\n Images","summary":" We study the problem of single-image 3D object reconstruction. Recent works\nhave diverged into two directions: regression-based modeling and generative\nmodeling. Regression methods efficiently infer visible surfaces, but struggle\nwith occluded regions. Generative methods handle uncertain regions better by\nmodeling distributions, but are computationally expensive and the generation is\noften misaligned with visible surfaces. In this paper, we present SPAR3D, a\nnovel two-stage approach aiming to take the best of both directions. The first\nstage of SPAR3D generates sparse 3D point clouds using a lightweight point\ndiffusion model, which has a fast sampling speed. The second stage uses both\nthe sampled point cloud and the input image to create highly detailed meshes.\nOur two-stage design enables probabilistic modeling of the ill-posed\nsingle-image 3D task while maintaining high computational efficiency and great\noutput fidelity. Using point clouds as an intermediate representation further\nallows for interactive user edits.
Evaluated on diverse datasets, SPAR3D\ndemonstrates superior performance over previous state-of-the-art methods, at an\ninference speed of 0.7 seconds. Project page with code and model:\nhttps://spar3d.github.io\n","authors":["Zixuan Huang","Mark Boss","Aaryaman Vasishta","James M. Rehg","Varun Jampani"],"pdf_url":"https://arxiv.org/pdf/2501.04689v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04678v1","updated":"2025-01-08T18:39:10Z","published":"2025-01-08T18:39:10Z","title":"RadGPT: Constructing 3D Image-Text Tumor Datasets","summary":" With over 85 million CT scans performed annually in the United States,\ncreating tumor-related reports is a challenging and time-consuming task for\nradiologists. To address this need, we present RadGPT, an Anatomy-Aware\nVision-Language AI Agent for generating detailed reports from CT scans. RadGPT\nfirst segments tumors, including benign cysts and malignant tumors, and their\nsurrounding anatomical structures, then transforms this information into both\nstructured reports and narrative reports. These reports provide tumor size,\nshape, location, attenuation, volume, and interactions with surrounding blood\nvessels and organs. Extensive evaluation on unseen hospitals shows that RadGPT\ncan produce accurate reports, with high sensitivity/specificity for small tumor\n(<2 cm) detection: 80/73% for liver tumors, 92/78% for kidney tumors, and\n77/77% for pancreatic tumors. For large tumors, sensitivity ranges from 89% to\n97%. The results significantly surpass the state-of-the-art in abdominal CT\nreport generation.\n RadGPT generated reports for 17 public datasets. Through radiologist review\nand refinement, we have ensured the reports' accuracy, and created the first\npublicly available image-text 3D medical dataset, comprising over 1.8 million\ntext tokens and 2.7 million images from 9,262 CT scans, including 2,947 tumor\nscans/reports of 8,562 tumor instances. 
Our reports can: (1) localize tumors in\neight liver sub-segments and three pancreatic sub-segments annotated per-voxel;\n(2) determine pancreatic tumor stage (T1-T4) in 260 reports; and (3) present\nindividual analyses of multiple tumors--rare in human-made reports.\nImportantly, 948 of the reports are for early-stage tumors.\n","authors":["Pedro R. A. S. Bassi","Mehmet Can Yavuz","Kang Wang","Xiaoxi Chen","Wenxuan Li","Sergio Decherchi","Andrea Cavalli","Yang Yang","Alan Yuille","Zongwei Zhou"],"pdf_url":"https://arxiv.org/pdf/2501.04678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04675v1","updated":"2025-01-08T18:33:17Z","published":"2025-01-08T18:33:17Z","title":"Enhancing Financial VQA in Vision Language Models using Intermediate\n Structured Representations","summary":" Chart interpretation is crucial for visual data analysis, but accurately\nextracting information from charts poses significant challenges for automated\nmodels. This study investigates the fine-tuning of DEPLOT, a modality\nconversion module that translates the image of a plot or chart to a linearized\ntable, on a custom dataset of 50,000 bar charts. The dataset comprises simple,\nstacked, and grouped bar charts, targeting the unique structural features of\nthese visualizations. The finetuned DEPLOT model is evaluated against its base\nversion using a test set of 1,000 images and two metrics: Relative Mapping\nSimilarity (RMS), which measures categorical mapping accuracy, and Relative\nNumber Set Similarity (RNSS), which evaluates numerical interpretation\naccuracy. To further explore the reasoning capabilities of large language\nmodels (LLMs), we curate an additional set of 100 bar chart images paired with\nquestion answer sets. 
Our findings demonstrate that providing a structured\nintermediate table alongside the image significantly enhances LLM reasoning\nperformance compared to direct image queries.\n","authors":["Archita Srivastava","Abhas Kumar","Rajesh Kumar","Prabhakar Srinivasan"],"pdf_url":"https://arxiv.org/pdf/2501.04675v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.02788v2","updated":"2025-01-08T18:33:07Z","published":"2025-01-06T06:07:40Z","title":"GLoG-CSUnet: Enhancing Vision Transformers with Adaptable Radiomic\n Features for Medical Image Segmentation","summary":" Vision Transformers (ViTs) have shown promise in medical image semantic\nsegmentation (MISS) by capturing long-range correlations. However, ViTs often\nstruggle to model local spatial information effectively, which is essential for\naccurately segmenting fine anatomical details, particularly when applied to\nsmall datasets without extensive pre-training. We introduce Gabor and Laplacian\nof Gaussian Convolutional Swin Network (GLoG-CSUnet), a novel architecture\nenhancing Transformer-based models by incorporating learnable radiomic\nfeatures. This approach integrates dynamically adaptive Gabor and Laplacian of\nGaussian (LoG) filters to capture texture, edge, and boundary information,\nenhancing the feature representation processed by the Transformer model. Our\nmethod uniquely combines the long-range dependency modeling of Transformers\nwith the texture analysis capabilities of Gabor and LoG features. 
Evaluated on\nthe Synapse multi-organ and ACDC cardiac segmentation datasets, GLoG-CSUnet\ndemonstrates significant improvements over state-of-the-art models, achieving a\n1.14% increase in Dice score for Synapse and 0.99% for ACDC, with minimal\ncomputational overhead (only 15 and 30 additional parameters, respectively).\nGLoG-CSUnet's flexible design allows integration with various base models,\noffering a promising approach for incorporating radiomics-inspired feature\nextraction in Transformer architectures for medical image analysis. The code\nimplementation is available on GitHub at: https://github.com/HAAIL/GLoG-CSUnet.\n","authors":["Niloufar Eghbali","Hassan Bagher-Ebadian","Tuka Alhanai","Mohammad M. Ghassemi"],"pdf_url":"https://arxiv.org/pdf/2501.02788v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04671v1","updated":"2025-01-08T18:31:16Z","published":"2025-01-08T18:31:16Z","title":"DRIVINGVQA: Analyzing Visual Chain-of-Thought Reasoning of Vision\n Language Models in Real-World Scenarios with Driving Theory Tests","summary":" Large vision-language models (LVLMs) augment language models with visual\nunderstanding, enabling multimodal reasoning. However, due to the modality gap\nbetween textual and visual data, they often face significant challenges, such\nas over-reliance on text priors, hallucinations, and limited capacity for\ncomplex visual reasoning. Existing benchmarks to evaluate visual reasoning in\nLVLMs often rely on schematic or synthetic images and on imprecise\nmachine-generated explanations. To bridge the modality gap, we present\nDrivingVQA, a new benchmark derived from driving theory tests to evaluate\nvisual chain-of-thought reasoning in complex real-world scenarios. It offers\n3,931 expert-crafted multiple-choice problems and interleaved explanations\ngrounded with entities relevant to the reasoning process. We leverage this\ndataset to perform an extensive study of LVLMs' ability to reason about complex\nvisual scenarios. 
Our experiments reveal that open-source and proprietary LVLMs\nstruggle with visual chain-of-thought reasoning under zero-shot settings. We\ninvestigate training strategies that leverage relevant entities to improve\nvisual reasoning. Notably, we observe a performance boost of up to 7\\% when\nreasoning over image tokens of cropped regions tied to these entities.\n","authors":["Charles Corbière","Simon Roburin","Syrielle Montariol","Antoine Bosselut","Alexandre Alahi"],"pdf_url":"https://arxiv.org/pdf/2501.04671v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04670v1","updated":"2025-01-08T18:30:53Z","published":"2025-01-08T18:30:53Z","title":"Are They the Same? Exploring Visual Correspondence Shortcomings of\n Multimodal LLMs","summary":" Recent advancements in multimodal models have shown a strong ability in\nvisual perception, reasoning abilities, and vision-language understanding.\nHowever, studies on visual matching ability are missing, where finding the\nvisual correspondence of objects is essential in vision research. Our research\nreveals that the matching capabilities in recent multimodal LLMs (MLLMs) still\nexhibit systematic shortcomings, even with current strong MLLMs models, GPT-4o.\nIn particular, we construct a Multimodal Visual Matching (MMVM) benchmark to\nfairly benchmark over 30 different MLLMs. The MMVM benchmark is built from 15\nopen-source datasets and Internet videos with manual annotation. We categorize\nthe data samples of MMVM benchmark into eight aspects based on the required\ncues and capabilities to more comprehensively evaluate and analyze current\nMLLMs. In addition, we have designed an automatic annotation pipeline to\ngenerate the MMVM SFT dataset, including 220K visual matching data with\nreasoning annotation. Finally, we present CoLVA, a novel contrastive MLLM with\ntwo novel technical designs: fine-grained vision expert with object-level\ncontrastive learning and instruction augmentation strategy. 
CoLVA achieves\n51.06\\% overall accuracy (OA) on the MMVM benchmark, surpassing GPT-4o and\nbaseline by 8.41\\% and 23.58\\% OA, respectively. The results show the\neffectiveness of our MMVM SFT dataset and our novel technical designs. Code,\nbenchmark, dataset, and models are available at\nhttps://github.com/zhouyiks/CoLVA.\n","authors":["Yikang Zhou","Tao Zhang","Shilin Xu","Shihao Chen","Qianyu Zhou","Yunhai Tong","Shunping Ji","Jiangning Zhang","Xiangtai Li","Lu Qi"],"pdf_url":"https://arxiv.org/pdf/2501.04670v1.pdf","comment":"project page: https://zhouyiks.github.io/projects/CoLVA/"},{"id":"http://arxiv.org/abs/2501.04666v1","updated":"2025-01-08T18:25:50Z","published":"2025-01-08T18:25:50Z","title":"Enhancing Virtual Try-On with Synthetic Pairs and Error-Aware Noise\n Scheduling","summary":" Given an isolated garment image in a canonical product view and a separate\nimage of a person, the virtual try-on task aims to generate a new image of the\nperson wearing the target garment. Prior virtual try-on works face two major\nchallenges in achieving this goal: a) the paired (human, garment) training data\nhas limited availability; b) generating textures on the human that perfectly\nmatch that of the prompted garment is difficult, often resulting in distorted\ntext and faded textures. Our work explores ways to tackle these issues through\nboth synthetic data as well as model refinement. We introduce a garment\nextraction model that generates (human, synthetic garment) pairs from a single\nimage of a clothed individual. The synthetic pairs can then be used to augment\nthe training of virtual try-on. We also propose an Error-Aware Refinement-based\nSchr\\\"odinger Bridge (EARSB) that surgically targets localized generation\nerrors for correcting the output of a base virtual try-on model. 
To identify\nlikely errors, we propose a weakly-supervised error classifier that localizes\nregions for refinement, subsequently augmenting the Schr\\\"odinger Bridge's\nnoise schedule with its confidence heatmap. Experiments on VITON-HD and\nDressCode-Upper demonstrate that our synthetic data augmentation enhances the\nperformance of prior work, while EARSB improves the overall image quality. In\nuser studies, our model is preferred by the users in an average of 59% of\ncases.\n","authors":["Nannan Li","Kevin J. Shih","Bryan A. Plummer"],"pdf_url":"https://arxiv.org/pdf/2501.04666v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04665v1","updated":"2025-01-08T18:22:44Z","published":"2025-01-08T18:22:44Z","title":"HyFusion: Enhanced Reception Field Transformer for Hyperspectral Image\n Fusion","summary":" Hyperspectral image (HSI) fusion addresses the challenge of reconstructing\nHigh-Resolution HSIs (HR-HSIs) from High-Resolution Multispectral images\n(HR-MSIs) and Low-Resolution HSIs (LR-HSIs), a critical task given the high\ncosts and hardware limitations associated with acquiring high-quality HSIs.\nWhile existing methods leverage spatial and spectral relationships, they often\nsuffer from limited receptive fields and insufficient feature utilization,\nleading to suboptimal performance. Furthermore, the scarcity of high-quality\nHSI data highlights the importance of efficient data utilization to maximize\nreconstruction quality. To address these issues, we propose HyFusion, a novel\nframework designed to enhance the receptive field and enable effective feature\nmap reusing, thereby maximizing data utilization. First, HR-MSI and LR-HSI\ninputs are concatenated to form a quasi-fused draft, preserving complementary\nspatial and spectral details. 
Next, the Enhanced Reception Field Block (ERFB)\nis introduced, combining shifting-window attention and dense connections to\nexpand the receptive field, effectively capturing long-range dependencies and\nreusing features to reduce information loss, thereby boosting data efficiency.\nFinally, the Dual-Coupled Network (DCN) dynamically extracts high-frequency\nspectral and spatial features from LR-HSI and HR-MSI, ensuring efficient\ncross-domain fusion. Extensive experiments demonstrate that HyFusion achieves\nstate-of-the-art performance in HR-MSI/LR-HSI fusion, significantly improving\nreconstruction quality while maintaining a compact model size and computational\nefficiency. By integrating enhanced receptive fields and feature map reusing,\nHyFusion provides a practical and effective solution for HSI fusion in\nresource-constrained scenarios, setting a new benchmark in hyperspectral\nimaging. Our code will be publicly available.\n","authors":["Chia-Ming Lee","Yu-Fan Lin","Yu-Hao Ho","Li-Wei Kang","Chih-Chung Hsu"],"pdf_url":"https://arxiv.org/pdf/2501.04665v1.pdf","comment":"Submitted to IGARSS 2025"},{"id":"http://arxiv.org/abs/2501.04648v1","updated":"2025-01-08T18:01:49Z","published":"2025-01-08T18:01:49Z","title":"FlairGPT: Repurposing LLMs for Interior Designs","summary":" Interior design involves the careful selection and arrangement of objects to\ncreate an aesthetically pleasing, functional, and harmonized space that aligns\nwith the client's design brief. This task is particularly challenging, as a\nsuccessful design must not only incorporate all the necessary objects in a\ncohesive style, but also ensure they are arranged in a way that maximizes\naccessibility, while adhering to a variety of affordability and usage\nconsiderations. Data-driven solutions have been proposed, but these are\ntypically room- or domain-specific and lack explainability in the design\nconsiderations used in producing the final layout.
In this paper, we\ninvestigate if large language models (LLMs) can be directly utilized for\ninterior design. While we find that LLMs are not yet capable of generating\ncomplete layouts, they can be effectively leveraged in a structured manner,\ninspired by the workflow of interior designers. By systematically probing LLMs,\nwe can reliably generate a list of objects along with relevant constraints that\nguide their placement. We translate this information into a design layout\ngraph, which is then solved using an off-the-shelf constrained optimization\nsetup to generate the final layouts. We benchmark our algorithm in various\ndesign configurations against existing LLM-based methods and human designs, and\nevaluate the results using a variety of quantitative and qualitative metrics\nalong with user studies. In summary, we demonstrate that LLMs, when used in a\nstructured manner, can effectively generate diverse high-quality layouts,\nmaking them a viable solution for creating large-scale virtual scenes. Project\nwebpage at https://flairgpt.github.io/\n","authors":["Gabrielle Littlefair","Niladri Shekhar Dutt","Niloy J. Mitra"],"pdf_url":"https://arxiv.org/pdf/2501.04648v1.pdf","comment":"Accepted at EUROGRAPHICS 2025"},{"id":"http://arxiv.org/abs/2501.04643v1","updated":"2025-01-08T17:49:52Z","published":"2025-01-08T17:49:52Z","title":"Discrete Wavelet Transform-Based Capsule Network for Hyperspectral Image\n Classification","summary":" Hyperspectral image (HSI) classification is a crucial technique for remote\nsensing to build large-scale earth monitoring systems. HSI contains much more\ninformation than traditional visual images for identifying the categories of\nland covers. One recent feasible solution for HSI is to leverage CapsNets for\ncapturing spectral-spatial information. However, these methods require high\ncomputational requirements due to the full connection architecture between\nstacked capsule layers. 
To solve this problem, a DWT-CapsNet is proposed to\nidentify partial but important connections in CapsNet for an effective and\nefficient HSI classification. Specifically, we integrate a tailored attention\nmechanism into a Discrete Wavelet Transform (DWT)-based downsampling layer,\nalleviating the information loss problem of conventional downsampling operations\nin feature extractors. Moreover, we propose a novel multi-scale routing\nalgorithm that prunes a large proportion of connections in CapsNet. A capsule\npyramid fusion mechanism is designed to aggregate the spectral-spatial\nrelationships at multiple levels of granularity, and then a self-attention\nmechanism is further conducted in a partially and locally connected\narchitecture to emphasize the meaningful relationships. As shown in the\nexperimental results, our method achieves state-of-the-art accuracy while\nkeeping lower computational demand regarding running time, FLOPs, and the\nnumber of parameters, rendering it an appealing choice for practical\nimplementation in HSI classification.\n","authors":["Zhiqiang Gao","Jiaqi Wang","Hangchi Shen","Zhihao Dou","Xiangbo Zhang","Kaizhu Huang"],"pdf_url":"https://arxiv.org/pdf/2501.04643v1.pdf","comment":"28 pages; 9 figures"},{"id":"http://arxiv.org/abs/2501.03800v2","updated":"2025-01-08T17:44:11Z","published":"2025-01-07T14:06:57Z","title":"MADation: Face Morphing Attack Detection with Foundation Models","summary":" Despite the considerable performance improvements of face recognition\nalgorithms in recent years, the same scientific advances responsible for this\nprogress can also be used to create efficient ways to attack them, posing a\nthreat to their secure deployment. 
Morphing attack detection (MAD) systems aim\nto detect a specific type of threat, morphing attacks, at an early stage,\npreventing them from being considered for verification in critical processes.\nFoundation models (FM) learn from extensive amounts of unlabeled data,\nachieving remarkable zero-shot generalization to unseen domains. Although this\ngeneralization capacity might be weak when dealing with domain-specific\ndownstream tasks such as MAD, FMs can easily adapt to these settings while\nretaining the built-in knowledge acquired during pre-training. In this work, we\nrecognize the potential of FMs to perform well in the MAD task when properly\nadapted to its specificities. To this end, we adapt FM CLIP architectures with\nLoRA weights while simultaneously training a classification header. The\nproposed framework, MADation, surpasses our alternative FM and transformer-based\nframeworks and constitutes the first adaptation of FMs to the MAD task. MADation\npresents competitive results with current MAD solutions in the literature and\neven surpasses them in several evaluation scenarios. To encourage\nreproducibility and facilitate further research in MAD, we publicly release the\nimplementation of MADation at https://github.com/gurayozgur/MADation\n","authors":["Eduarda Caldeira","Guray Ozgur","Tahar Chettaoui","Marija Ivanovska","Peter Peer","Fadi Boutros","Vitomir Struc","Naser Damer"],"pdf_url":"https://arxiv.org/pdf/2501.03800v2.pdf","comment":"Accepted at WACV 2025 workshops"},{"id":"http://arxiv.org/abs/2501.04631v1","updated":"2025-01-08T17:27:27Z","published":"2025-01-08T17:27:27Z","title":"Disentangled Clothed Avatar Generation with Layered Representation","summary":" Clothed avatar generation has wide applications in virtual and augmented\nreality, filmmaking, and more. 
Previous methods have achieved success in\ngenerating diverse digital avatars; however, generating avatars with\ndisentangled components (e.g., body, hair, and clothes) has long been a\nchallenge. In this paper, we propose LayerAvatar, the first feed-forward\ndiffusion-based method for generating component-disentangled clothed avatars.\nTo achieve this, we first propose a layered UV feature plane representation,\nwhere components are distributed in different layers of the Gaussian-based UV\nfeature plane with corresponding semantic labels. This representation supports\nhigh-resolution and real-time rendering, as well as expressive animation\nincluding controllable gestures and facial expressions. Based on the\nwell-designed representation, we train a single-stage diffusion model and\nintroduce constraint terms to address the severe occlusion problem of the\ninnermost human body layer. Extensive experiments demonstrate the impressive\nperformance of our method in generating disentangled clothed avatars, and we\nfurther explore its applications in component transfer. The project page is\navailable at: https://olivia23333.github.io/LayerAvatar/\n","authors":["Weitian Zhang","Sijing Wu","Manwen Liao","Yichao Yan"],"pdf_url":"https://arxiv.org/pdf/2501.04631v1.pdf","comment":"project page: https://olivia23333.github.io/LayerAvatar/"},{"id":"http://arxiv.org/abs/2501.04628v1","updated":"2025-01-08T17:19:35Z","published":"2025-01-08T17:19:35Z","title":"FatesGS: Fast and Accurate Sparse-View Surface Reconstruction using\n Gaussian Splatting with Depth-Feature Consistency","summary":" Recently, Gaussian Splatting has sparked a new trend in the field of computer\nvision. Apart from novel view synthesis, it has also been extended to the area\nof multi-view reconstruction. The latest methods facilitate complete, detailed\nsurface reconstruction while ensuring fast training speed. 
However, these\nmethods still require dense input views, and their output quality significantly\ndegrades with sparse views. We observed that the Gaussian primitives tend to\noverfit the few training views, leading to noisy floaters and incomplete\nreconstruction surfaces. In this paper, we present an innovative sparse-view\nreconstruction framework that leverages intra-view depth and multi-view feature\nconsistency to achieve remarkably accurate surface reconstruction.\nSpecifically, we utilize monocular depth ranking information to supervise the\nconsistency of depth distribution within patches and employ a smoothness loss\nto enhance the continuity of the distribution. To achieve finer surface\nreconstruction, we optimize the absolute position of depth through multi-view\nprojection features. Extensive experiments on DTU and BlendedMVS demonstrate\nthat our method outperforms state-of-the-art methods with a speedup of 60x to\n200x, achieving swift and fine-grained mesh reconstruction without the need for\ncostly pre-training.\n","authors":["Han Huang","Yulun Wu","Chao Deng","Ge Gao","Ming Gu","Yu-Shen Liu"],"pdf_url":"https://arxiv.org/pdf/2501.04628v1.pdf","comment":"Accepted by AAAI 2025. Project page:\n https://alvin528.github.io/FatesGS/"},{"id":"http://arxiv.org/abs/2412.16780v2","updated":"2025-01-08T17:00:18Z","published":"2024-12-21T21:27:22Z","title":"Forget Vectors at Play: Universal Input Perturbations Driving Machine\n Unlearning in Image Classification","summary":" Machine unlearning (MU), which seeks to erase the influence of specific\nunwanted data from already-trained models, is becoming increasingly vital in\nmodel editing, particularly to comply with evolving data regulations like the\n``right to be forgotten''. Conventional approaches are predominantly\nmodel-based, typically requiring retraining or fine-tuning the model's weights\nto meet unlearning requirements. 
In this work, we approach the MU problem from\na novel input perturbation-based perspective, where the model weights remain\nintact throughout the unlearning process. We demonstrate the existence of a\nproactive input-based unlearning strategy, referred to as the forget vector, which can\nbe generated as an input-agnostic data perturbation and remains as effective as\nmodel-based approximate unlearning approaches. We also explore forget vector\narithmetic, whereby multiple class-specific forget vectors are combined through\nsimple operations (e.g., linear combinations) to generate new forget vectors\nfor unseen unlearning tasks, such as forgetting arbitrary subsets across\nclasses. Extensive experiments validate the effectiveness and adaptability of\nthe forget vector, showcasing its competitive performance relative to\nstate-of-the-art model-based methods. Codes are available at\nhttps://github.com/Changchangsun/Forget-Vector.\n","authors":["Changchang Sun","Ren Wang","Yihua Zhang","Jinghan Jia","Jiancheng Liu","Gaowen Liu","Sijia Liu","Yan Yan"],"pdf_url":"https://arxiv.org/pdf/2412.16780v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04608v1","updated":"2025-01-08T16:44:06Z","published":"2025-01-08T16:44:06Z","title":"Comprehensive Examination of Unrolled Networks for Linear Inverse\n Problems","summary":" Unrolled networks have become prevalent in various computer vision and\nimaging tasks. Although they have demonstrated remarkable efficacy in solving\nspecific computer vision and computational imaging tasks, their adaptation to\nother applications presents considerable challenges. This is primarily due to\nthe multitude of design decisions that practitioners working on new\napplications must navigate, each potentially affecting the network's overall\nperformance. These decisions include selecting the optimization algorithm,\ndefining the loss function, and determining the number of convolutional layers,\namong others. 
Compounding the issue, evaluating each design choice requires\ntime-consuming simulations to train, fine-tune the neural network, and optimize\nfor its performance. As a result, the process of exploring multiple options and\nidentifying the optimal configuration becomes time-consuming and\ncomputationally demanding. The main objectives of this paper are (1) to unify\nsome ideas and methodologies used in unrolled networks to reduce the number of\ndesign choices a user has to make, and (2) to report a comprehensive ablation\nstudy to discuss the impact of each of the choices involved in designing\nunrolled networks and present practical recommendations based on our findings.\nWe anticipate that this study will help scientists and engineers design\nunrolled networks for their applications and diagnose problems within their\nnetworks efficiently.\n","authors":["Eric Chen","Xi Chen","Arian Maleki","Shirin Jalali"],"pdf_url":"https://arxiv.org/pdf/2501.04608v1.pdf","comment":"27 pages, 10 figures. Project Page:\n https://github.com/YuxiChen25/Memory-Net-Inverse"},{"id":"http://arxiv.org/abs/2501.04606v1","updated":"2025-01-08T16:41:31Z","published":"2025-01-08T16:41:31Z","title":"Enhancing Low-Cost Video Editing with Lightweight Adaptors and\n Temporal-Aware Inversion","summary":" Recent advancements in text-to-image (T2I) generation using diffusion models\nhave enabled cost-effective video-editing applications by leveraging\npre-trained models, eliminating the need for resource-intensive training.\nHowever, the frame-independence of T2I generation often results in poor\ntemporal consistency. Existing methods address this issue through temporal\nlayer fine-tuning or inference-based temporal propagation, but these approaches\nsuffer from high training costs or limited temporal coherence. To address these\nchallenges, we propose a General and Efficient Adapter (GE-Adapter) that\nintegrates temporal-spatial and semantic consistency with Bilateral DDIM\ninversion. 
This framework introduces three key components: (1) Frame-based\nTemporal Consistency Blocks (FTC Blocks) to capture frame-specific features and\nenforce smooth inter-frame transitions via temporally-aware loss functions; (2)\nChannel-dependent Spatial Consistency Blocks (SCD Blocks) employing bilateral\nfilters to enhance spatial coherence by reducing noise and artifacts; and (3)\na Token-based Semantic Consistency Module (TSC Module) to maintain semantic\nalignment using shared prompt tokens and frame-specific tokens. Our method\nsignificantly improves perceptual quality, text-image alignment, and temporal\ncoherence, as demonstrated on the MSR-VTT dataset. Additionally, it achieves\nenhanced fidelity and frame-to-frame coherence, offering a practical solution\nfor T2V editing.\n","authors":["Yangfan He","Sida Li","Kun Li","Jianhui Wang","Binxu Li","Tianyu Shi","Jun Yin","Miao Zhang","Xueqian Wang"],"pdf_url":"https://arxiv.org/pdf/2501.04606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04597v1","updated":"2025-01-08T16:25:32Z","published":"2025-01-08T16:25:32Z","title":"FrontierNet: Learning Visual Cues to Explore","summary":" Exploration of unknown environments is crucial for autonomous robots; it\nallows them to actively reason and decide on what new data to acquire for tasks\nsuch as mapping, object discovery, and environmental assessment. Existing\nmethods, such as frontier-based methods, rely heavily on 3D map operations,\nwhich are limited by map quality and often overlook valuable context from\nvisual cues. This work aims at leveraging 2D visual cues for efficient\nautonomous exploration, addressing the limitations of extracting goal poses\nfrom a 3D map. We propose an image-only frontier-based exploration system, with\nFrontierNet as a core component developed in this work. FrontierNet is a\nlearning-based model that (i) detects frontiers, and (ii) predicts their\ninformation gain, from posed RGB images enhanced by monocular depth priors. 
Our\napproach provides an alternative to existing 3D-dependent exploration systems,\nachieving a 16% improvement in early-stage exploration efficiency, as validated\nthrough extensive simulations and real-world experiments.\n","authors":["Boyang Sun","Hanzhi Chen","Stefan Leutenegger","Cesar Cadena","Marc Pollefeys","Hermann Blum"],"pdf_url":"https://arxiv.org/pdf/2501.04597v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03214v2","updated":"2025-01-08T16:10:20Z","published":"2024-04-04T05:39:09Z","title":"LeGrad: An Explainability Method for Vision Transformers via Feature\n Formation Sensitivity","summary":" Vision Transformers (ViTs), with their ability to model long-range\ndependencies through self-attention mechanisms, have become a standard\narchitecture in computer vision. However, the interpretability of these models\nremains a challenge. To address this, we propose LeGrad, an explainability\nmethod specifically designed for ViTs. LeGrad computes the gradient with\nrespect to the attention maps of ViT layers, considering the gradient itself as\nthe explainability signal. We aggregate the signal over all layers, combining\nthe activations of the last as well as intermediate tokens to produce the\nmerged explainability map. This makes LeGrad a conceptually simple and an\neasy-to-implement tool for enhancing the transparency of ViTs. We evaluate\nLeGrad in challenging segmentation, perturbation, and open-vocabulary settings,\nshowcasing its versatility compared to other SotA explainability methods\ndemonstrating its superior spatial fidelity and robustness to perturbations. 
A\ndemo and the code are available at https://github.com/WalBouss/LeGrad.\n","authors":["Walid Bousselham","Angie Boggust","Sofian Chaybouti","Hendrik Strobelt","Hilde Kuehne"],"pdf_url":"https://arxiv.org/pdf/2404.03214v2.pdf","comment":"Code available at https://github.com/WalBouss/LeGrad"},{"id":"http://arxiv.org/abs/2501.04586v1","updated":"2025-01-08T16:06:21Z","published":"2025-01-08T16:06:21Z","title":"Identity-Preserving Video Dubbing Using Motion Warping","summary":" Video dubbing aims to synthesize realistic, lip-synced videos from a\nreference video and a driving audio signal. Although existing methods can\naccurately generate mouth shapes driven by audio, they often fail to preserve\nidentity-specific features, largely because they do not effectively capture the\nnuanced interplay between audio cues and the visual attributes of the reference\nidentity. As a result, the generated outputs frequently lack fidelity in\nreproducing the unique textural and structural details of the reference\nidentity. To address these limitations, we propose IPTalker, a novel and robust\nframework for video dubbing that achieves seamless alignment between driving\naudio and reference identity while ensuring both lip-sync accuracy and\nhigh-fidelity identity preservation. At the core of IPTalker is a\ntransformer-based alignment mechanism designed to dynamically capture and model\nthe correspondence between audio features and reference images, thereby\nenabling precise, identity-aware audio-visual integration. Building on this\nalignment, a motion warping strategy further refines the results by spatially\ndeforming reference images to match the target audio-driven configuration. A\ndedicated refinement process then mitigates occlusion artifacts and enhances\nthe preservation of fine-grained textures, such as mouth details and skin\nfeatures. 
Extensive qualitative and quantitative evaluations demonstrate that\nIPTalker consistently outperforms existing approaches in terms of realism, lip\nsynchronization, and identity retention, establishing a new state of the art\nfor high-quality, identity-consistent video dubbing.\n","authors":["Runzhen Liu","Qinjie Lin","Yunfei Liu","Lijian Lin","Ye Zhu","Yu Li","Chuhua Xian","Fa-Ting Hong"],"pdf_url":"https://arxiv.org/pdf/2501.04586v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2501.04582v1","updated":"2025-01-08T15:56:21Z","published":"2025-01-08T15:56:21Z","title":"Boosting Salient Object Detection with Knowledge Distillated from Large\n Foundation Models","summary":" Salient Object Detection (SOD) aims to identify and segment prominent regions\nwithin a scene. Traditional models rely on manually annotated pseudo labels\nwith precise pixel-level accuracy, which is time-consuming. We developed a\nlow-cost, high-precision annotation method by leveraging large foundation\nmodels to address the challenges. Specifically, we use a weakly supervised\napproach to guide large models in generating pseudo-labels through textual\nprompts. Since large models do not effectively focus on the salient regions of\nimages, we manually annotate a subset of text to fine-tune the model. Based on\nthis approach, which enables precise and rapid generation of pseudo-labels, we\nintroduce a new dataset, BDS-TR. Compared to the previous DUTS-TR dataset,\nBDS-TR is more prominent in scale and encompasses a wider variety of categories\nand scenes. This expansion will enhance our model's applicability across a\nbroader range of scenarios and provide a more comprehensive foundational\ndataset for future SOD research. Additionally, we present an edge decoder based\non dynamic upsampling, which focuses on object edges while gradually recovering\nimage feature resolution. 
Comprehensive experiments on five benchmark datasets\ndemonstrate that our method significantly outperforms state-of-the-art\napproaches and also surpasses several existing fully-supervised SOD methods.\nThe code and results will be made available.\n","authors":["Miaoyang He","Shuyong Gao","Tsui Qin Mok","Weifeng Ge","Wengqiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.04582v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.12408v2","updated":"2025-01-08T15:54:31Z","published":"2024-07-17T08:39:20Z","title":"Towards Revisiting Visual Place Recognition for Joining Submaps in\n Multimap SLAM","summary":" Visual SLAM is a key technology for many autonomous systems. However,\ntracking loss can lead to the creation of disjoint submaps in multimap SLAM\nsystems like ORB-SLAM3. Because of that, these systems employ submap merging\nstrategies. As we show, these strategies are not always successful. In this\npaper, we investigate the impact of using modern VPR approaches for submap\nmerging in visual SLAM. We argue that classical evaluation metrics are not\nsufficient to estimate the impact of a modern VPR component on the overall\nsystem. We show that naively replacing the VPR component does not leverage its\nfull potential without requiring substantial interference in the original\nsystem. Because of that, we present a post-processing pipeline along with a set\nof metrics that allow us to estimate the impact of modern VPR components. We\nevaluate our approach on the NCLT and Newer College datasets using ORB-SLAM3\nwith NetVLAD and HDC-DELF as VPR components. Additionally, we present a simple\napproach for combining VPR with temporal consistency for map merging. We show\nthat the map merging performance of ORB-SLAM3 can be improved. 
Building on\nthese results, researchers in VPR can assess the potential of their approaches\nfor SLAM systems.\n","authors":["Markus Weißflog","Stefan Schubert","Peter Protzel","Peer Neubert"],"pdf_url":"https://arxiv.org/pdf/2407.12408v2.pdf","comment":"Accepted at TAROS 2024. This is the submitted version"},{"id":"http://arxiv.org/abs/2501.04579v1","updated":"2025-01-08T15:48:30Z","published":"2025-01-08T15:48:30Z","title":"Unified Coding for Both Human Perception and Generalized Machine\n Analytics with CLIP Supervision","summary":" The image compression model has long struggled with adaptability and\ngeneralization, as the decoded bitstream typically serves only human or machine\nneeds and fails to preserve information for unseen visual tasks. Therefore,\nthis paper innovatively introduces supervision obtained from multimodal\npre-training models and incorporates adaptive multi-objective optimization\ntailored to support both human visual perception and machine vision\nsimultaneously with a single bitstream, denoted as Unified and Generalized\nImage Coding for Machine (UG-ICM). Specifically, to get rid of the reliance\nbetween compression models with downstream task supervision, we introduce\nContrastive Language-Image Pre-training (CLIP) models into the training\nconstraint for improved generalization. Global-to-instance-wise CLIP\nsupervision is applied to help obtain hierarchical semantics that make models\nmore generalizable for the tasks relying on the information of different\ngranularity. Furthermore, for supporting both human and machine visions with\nonly a unifying bitstream, we incorporate a conditional decoding strategy that\ntakes as conditions human or machine preferences, enabling the bitstream to be\ndecoded into different versions for corresponding preferences. As such, our\nproposed UG-ICM is fully trained in a self-supervised manner, i.e., without\nawareness of any specific downstream models and tasks. 
The extensive\nexperiments have shown that the proposed UG-ICM is capable of achieving\nremarkable improvements in various unseen machine analytics tasks, while\nsimultaneously providing perceptually satisfying images.\n","authors":["Kangsheng Yin","Quan Liu","Xuelin Shen","Yulin He","Wenhan Yang","Shiqi Wang"],"pdf_url":"https://arxiv.org/pdf/2501.04579v1.pdf","comment":"9 pages, 10 figures, published to AAAI 2025"},{"id":"http://arxiv.org/abs/2406.15811v2","updated":"2025-01-08T15:32:35Z","published":"2024-06-22T10:33:14Z","title":"PointDreamer: Zero-shot 3D Textured Mesh Reconstruction from Colored\n Point Cloud","summary":" Reconstructing textured meshes from colored point clouds is an important but\nchallenging task. Most existing methods yield blurry-looking textures or rely\non 3D training data that are hard to acquire. Regarding this, we propose\nPointDreamer, a novel framework for textured mesh reconstruction from colored\npoint cloud via diffusion-based 2D inpainting. Specifically, we first\nreconstruct an untextured mesh. Next, we project the input point cloud into 2D\nspace to generate sparse multi-view images, and then inpaint empty pixels\nutilizing a pre-trained 2D diffusion model. After that, we unproject the colors\nof the inpainted dense images onto the untextured mesh, thus obtaining the\nfinal textured mesh. This project-inpaint-unproject pipeline bridges the gap\nbetween 3D point clouds and 2D diffusion models for the first time. Thanks to\nthe powerful 2D diffusion model pre-trained on extensive 2D data, PointDreamer\nreconstructs clear, high-quality textures with high robustness to sparse or\nnoisy input. Also, it is zero-shot, requiring no extra training. In addition, we\ndesign a Non-Border-First unprojection strategy to address the border-area\ninconsistency issue, which is less explored but commonly occurring in methods\nthat generate 3D textures from multiview images. 
Extensive qualitative and\nquantitative experiments on various synthetic and real-scanned datasets show\nthe SoTA performance of PointDreamer, by significantly outperforming baseline\nmethods with 30% improvement in LPIPS score (from 0.118 to 0.068). Code at:\nhttps://github.com/YuQiao0303/PointDreamer.\n","authors":["Qiao Yu","Xianzhi Li","Yuan Tang","Xu Han","Jinfeng Xu","Long Hu","Min Chen"],"pdf_url":"https://arxiv.org/pdf/2406.15811v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04568v1","updated":"2025-01-08T15:32:12Z","published":"2025-01-08T15:32:12Z","title":"Supervision-free Vision-Language Alignment","summary":" Vision-language models (VLMs) have demonstrated remarkable potential in\nintegrating visual and linguistic information, but their performance is often\nconstrained by the need for extensive, high-quality image-text training data.\nCuration of these image-text pairs is both time-consuming and computationally\nexpensive. To address this challenge, we introduce SVP (Supervision-free Visual\nProjection), a novel framework that enhances vision-language alignment without\nrelying on curated data or preference annotation. SVP leverages self-captioning\nand a pre-trained grounding model as a feedback mechanism to elicit latent\ninformation in VLMs. We evaluate our approach across six key areas: captioning,\nreferring, visual question answering, multitasking, hallucination control, and\nobject recall. Results demonstrate significant improvements, including a 14%\naverage improvement in captioning tasks, up to 12% increase in object recall,\nand substantial reduction in hallucination rates. 
Notably, a small VLM using\nSVP achieves hallucination reductions comparable to a model five times larger,\nwhile a VLM with initially poor referring capabilities more than doubles its\nperformance, approaching parity with a model twice its size.\n","authors":["Giorgio Giannone","Ruoteng Li","Qianli Feng","Evgeny Perevodchikov","Rui Chen","Aleix Martinez"],"pdf_url":"https://arxiv.org/pdf/2501.04568v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2501.04565v1","updated":"2025-01-08T15:25:19Z","published":"2025-01-08T15:25:19Z","title":"Learnable Scaled Gradient Descent for Guaranteed Robust Tensor PCA","summary":" Robust tensor principal component analysis (RTPCA) aims to separate the\nlow-rank and sparse components from multi-dimensional data, making it an\nessential technique in the signal processing and computer vision fields.\nRecently emerging tensor singular value decomposition (t-SVD) has gained\nconsiderable attention for its ability to better capture the low-rank structure\nof tensors compared to traditional matrix SVD. However, existing methods often\nrely on the computationally expensive tensor nuclear norm (TNN), which limits\ntheir scalability for real-world tensors. To address this issue, we explore an\nefficient scaled gradient descent (SGD) approach within the t-SVD framework for\nthe first time, and propose the RTPCA-SGD method. Theoretically, we rigorously\nestablish the recovery guarantees of RTPCA-SGD under mild assumptions,\ndemonstrating that with appropriate parameter selection, it achieves linear\nconvergence to the true low-rank tensor at a constant rate, independent of the\ncondition number. To enhance its practical applicability, we further propose a\nlearnable self-supervised deep unfolding model, which enables effective\nparameter learning. 
Numerical experiments on both synthetic and real-world\ndatasets demonstrate the superior performance of the proposed methods while\nmaintaining competitive computational efficiency, especially consuming less\ntime than RTPCA-TNN.\n","authors":["Lanlan Feng","Ce Zhu","Yipeng Liu","Saiprasad Ravishankar","Longxiu Huang"],"pdf_url":"https://arxiv.org/pdf/2501.04565v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04561v1","updated":"2025-01-08T15:18:09Z","published":"2025-01-08T15:18:09Z","title":"OpenOmni: Large Language Models Pivot Zero-shot Omnimodal Alignment\n across Language with Real-time Self-Aware Emotional Speech Synthesis","summary":" Recent advancements in omnimodal learning have been achieved in understanding\nand generation across images, text, and speech, though mainly within\nproprietary models. Limited omnimodal datasets and the inherent challenges\nassociated with real-time emotional speech generation have hindered open-source\nprogress. To address these issues, we propose openomni, a two-stage training\nmethod combining omnimodal alignment and speech generation to develop a\nstate-of-the-art omnimodal large language model. In the alignment phase, a\npre-trained speech model is further trained on text-image tasks to generalize\nfrom vision to speech in a (near) zero-shot manner, outperforming models\ntrained on tri-modal datasets. In the speech generation phase, a lightweight\ndecoder facilitates real-time emotional speech through training on speech tasks\nand preference learning. 
Experiments demonstrate that openomni consistently\nimproves across omnimodal, vision-language, and speech-language evaluations,\nenabling natural, emotion-rich dialogues and real-time emotional speech\ngeneration.\n","authors":["Run Luo","Ting-En Lin","Haonan Zhang","Yuchuan Wu","Xiong Liu","Min Yang","Yongbin Li","Longze Chen","Jiaming Li","Lei Zhang","Yangyi Chen","Hamid Alinejad-Rokny","Fei Huang"],"pdf_url":"https://arxiv.org/pdf/2501.04561v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10150v4","updated":"2025-01-08T15:15:12Z","published":"2024-01-18T17:22:37Z","title":"Motion-Zero: Zero-Shot Moving Object Control Framework for\n Diffusion-Based Video Generation","summary":" Recent large-scale pre-trained diffusion models have demonstrated a powerful\ngenerative ability to produce high-quality videos from detailed text\ndescriptions. However, exerting control over the motion of objects in videos\ngenerated by any video diffusion model is a challenging problem. In this paper,\nwe propose a novel zero-shot moving object trajectory control framework,\nMotion-Zero, to enable a bounding-box-trajectories-controlled text-to-video\ndiffusion model. To this end, an initial noise prior module is designed to\nprovide a position-based prior to improve the stability of the appearance of\nthe moving object and the accuracy of position. In addition, based on the\nattention map of the U-net, spatial constraints are directly applied to the\ndenoising process of diffusion models, which further ensures the positional and\nspatial consistency of moving objects during the inference. Furthermore,\ntemporal consistency is guaranteed with a proposed shift temporal attention\nmechanism. Our method can be flexibly applied to various state-of-the-art video\ndiffusion models without any training process. Extensive experiments\ndemonstrate our proposed method can control the motion trajectories of objects\nand generate high-quality videos. 
Our project page is\nhttps://vpx-ecnu.github.io/MotionZero-website/\n","authors":["Changgu Chen","Junwei Shu","Gaoqi He","Changbo Wang","Yang Li"],"pdf_url":"https://arxiv.org/pdf/2401.10150v4.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2405.02334v2","updated":"2025-01-08T14:42:05Z","published":"2024-04-26T15:02:39Z","title":"Rad4XCNN: a new agnostic method for post-hoc global explanation of\n CNN-derived features by means of radiomics","summary":" In recent years, machine learning-based clinical decision support systems\n(CDSS) have played a key role in the analysis of several medical conditions.\nDespite their promising capabilities, the lack of transparency in AI models\nposes significant challenges, particularly in medical contexts where\nreliability is a mandatory aspect. However, it appears that explainability is\ninversely proportional to accuracy. For this reason, achieving transparency\nwithout compromising predictive accuracy remains a key challenge. This paper\npresents a novel method, namely Rad4XCNN, to enhance the predictive power of\nCNN-derived features with the inherent interpretability of radiomic features.\nRad4XCNN diverges from conventional methods based on saliency maps, by\nassociating intelligible meaning to CNN-derived features by means of Radiomics,\noffering new perspectives on explanation methods beyond visualization maps.\nUsing a breast cancer classification task as a case study, we evaluated\nRad4XCNN on ultrasound imaging datasets, including an online dataset and two\nin-house datasets for internal and external validation. Some key results are:\ni) CNN-derived features guarantee more robust accuracy when compared against\nViT-derived and radiomic features; ii) conventional visualization map methods\nfor explanation present several pitfalls; iii) Rad4XCNN does not sacrifice\nmodel accuracy for their explainability; iv) Rad4XCNN provides a global\nexplanation enabling the physician to extract global insights and findings. 
Our\nmethod can mitigate some concerns related to the explainability-accuracy\ntrade-off. This study highlighted the importance of proposing new methods for\nmodel explanation without affecting their accuracy.\n","authors":["Francesco Prinzi","Carmelo Militello","Calogero Zarcaro","Tommaso Vincenzo Bartolotta","Salvatore Gaglio","Salvatore Vitabile"],"pdf_url":"https://arxiv.org/pdf/2405.02334v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.00599v2","updated":"2025-01-08T14:38:30Z","published":"2024-12-31T18:56:46Z","title":"VideoRefer Suite: Advancing Spatial-Temporal Object Understanding with\n Video LLM","summary":" Video Large Language Models (Video LLMs) have recently exhibited remarkable\ncapabilities in general video understanding. However, they mainly focus on\nholistic comprehension and struggle with capturing fine-grained spatial and\ntemporal details. Besides, the lack of high-quality object-level video\ninstruction data and a comprehensive benchmark further hinders their\nadvancements. To tackle these challenges, we introduce the VideoRefer Suite to\nempower Video LLM for finer-level spatial-temporal video understanding, i.e.,\nenabling perception and reasoning on any objects throughout the video.\nSpecially, we thoroughly develop VideoRefer Suite across three essential\naspects: dataset, model, and benchmark. Firstly, we introduce a multi-agent\ndata engine to meticulously curate a large-scale, high-quality object-level\nvideo instruction dataset, termed VideoRefer-700K. Next, we present the\nVideoRefer model, which equips a versatile spatial-temporal object encoder to\ncapture precise regional and sequential representations. Finally, we\nmeticulously create a VideoRefer-Bench to comprehensively assess the\nspatial-temporal understanding capability of a Video LLM, evaluating it across\nvarious aspects. 
Extensive experiments and analyses demonstrate that our\nVideoRefer model not only achieves promising performance on video referring\nbenchmarks but also facilitates general video understanding capabilities.\n","authors":["Yuqian Yuan","Hang Zhang","Wentong Li","Zesen Cheng","Boqiang Zhang","Long Li","Xin Li","Deli Zhao","Wenqiao Zhang","Yueting Zhuang","Jianke Zhu","Lidong Bing"],"pdf_url":"https://arxiv.org/pdf/2501.00599v2.pdf","comment":"17 pages, 14 figures, technical report"},{"id":"http://arxiv.org/abs/2501.04534v1","updated":"2025-01-08T14:33:47Z","published":"2025-01-08T14:33:47Z","title":"Combining YOLO and Visual Rhythm for Vehicle Counting","summary":" Video-based vehicle detection and counting play a critical role in managing\ntransport infrastructure. Traditional image-based counting methods usually\ninvolve two main steps: initial detection and subsequent tracking, which are\napplied to all video frames, leading to a significant increase in computational\ncomplexity. To address this issue, this work presents an alternative and more\nefficient method for vehicle detection and counting. The proposed approach\neliminates the need for a tracking step and focuses solely on detecting\nvehicles in key video frames, thereby increasing its efficiency. To achieve\nthis, we developed a system that combines YOLO, for vehicle detection, with\nVisual Rhythm, a way to create time-spatial images that allows us to focus on\nframes that contain useful information. Additionally, this method can be used\nfor counting in any application involving unidirectional moving targets to be\ndetected and identified. Experimental analysis using real videos shows that the\nproposed method achieves mean counting accuracy around 99.15% over a set of\nvideos, with a processing speed three times faster than tracking based\napproaches.\n","authors":["Victor Nascimento Ribeiro","Nina S. T. 
Hirata"],"pdf_url":"https://arxiv.org/pdf/2501.04534v1.pdf","comment":"Accepted for presentation at the Conference on Graphics, Patterns and\n Images (SIBGRAPI) 2023"},{"id":"http://arxiv.org/abs/2501.01483v2","updated":"2025-01-08T14:29:10Z","published":"2025-01-02T18:42:07Z","title":"Embedding Similarity Guided License Plate Super Resolution","summary":" Super-resolution (SR) techniques play a pivotal role in enhancing the quality\nof low-resolution images, particularly for applications such as security and\nsurveillance, where accurate license plate recognition is crucial. This study\nproposes a novel framework that combines pixel-based loss with embedding\nsimilarity learning to address the unique challenges of license plate\nsuper-resolution (LPSR). The introduced pixel and embedding consistency loss\n(PECL) integrates a Siamese network and applies contrastive loss to force\nembedding similarities to improve perceptual and structural fidelity. By\neffectively balancing pixel-wise accuracy with embedding-level consistency, the\nframework achieves superior alignment of fine-grained features between\nhigh-resolution (HR) and super-resolved (SR) license plates. 
Extensive\nexperiments on the CCPD dataset validate the efficacy of the proposed\nframework, demonstrating consistent improvements over state-of-the-art methods\nin terms of PSNR_RGB, PSNR_Y and optical character recognition (OCR) accuracy.\nThese results highlight the potential of embedding similarity learning to\nadvance both perceptual quality and task-specific performance in extreme\nsuper-resolution scenarios.\n","authors":["Abderrezzaq Sendjasni","Mohamed-Chaker Larabi"],"pdf_url":"https://arxiv.org/pdf/2501.01483v2.pdf","comment":"Submitted to Neurocomputing"},{"id":"http://arxiv.org/abs/2402.13809v3","updated":"2025-01-08T14:21:46Z","published":"2024-02-21T13:46:25Z","title":"NeuralDiffuser: Neuroscience-inspired Diffusion Guidance for fMRI Visual\n Reconstruction","summary":" Reconstructing visual stimuli from functional Magnetic Resonance Imaging fMRI\nenables fine-grained retrieval of brain activity. However, the accurate\nreconstruction of diverse details, including structure, background, texture,\ncolor, and more, remains challenging. The stable diffusion models inevitably\nresult in the variability of reconstructed images, even under identical\nconditions. To address this challenge, we first uncover the neuroscientific\nperspective of diffusion methods, which primarily involve top-down creation\nusing pre-trained knowledge from extensive image datasets, but tend to lack\ndetail-driven bottom-up perception, leading to a loss of faithful details. In\nthis paper, we propose NeuralDiffuser, which incorporates primary visual\nfeature guidance to provide detailed cues in the form of gradients. This\nextension of the bottom-up process for diffusion models achieves both semantic\ncoherence and detail fidelity when reconstructing visual stimuli. Furthermore,\nwe have developed a novel guidance strategy for reconstruction tasks that\nensures the consistency of repeated outputs with original images rather than\nwith various outputs. 
Extensive experimental results on the Natural Scenes\nDataset (NSD) qualitatively and quantitatively demonstrate the advancement of\nNeuralDiffuser by comparing it against baseline and state-of-the-art methods\nhorizontally, as well as conducting longitudinal ablation studies.\n","authors":["Haoyu Li","Hao Wu","Badong Chen"],"pdf_url":"https://arxiv.org/pdf/2402.13809v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04527v1","updated":"2025-01-08T14:19:03Z","published":"2025-01-08T14:19:03Z","title":"Towards Fair Class-wise Robustness: Class Optimal Distribution\n Adversarial Training","summary":" Adversarial training has proven to be a highly effective method for improving\nthe robustness of deep neural networks against adversarial attacks.\nNonetheless, it has been observed to exhibit a limitation in terms of robust\nfairness, characterized by a significant disparity in robustness across\ndifferent classes. Recent efforts to mitigate this problem have turned to\nclass-wise reweighted methods. However, these methods suffer from a lack of\nrigorous theoretical analysis and are limited in their exploration of the\nweight space, as they mainly rely on existing heuristic algorithms or intuition\nto compute weights. In addition, these methods fail to guarantee the\nconsistency of the optimization direction due to the decoupled optimization of\nweights and the model parameters. They potentially lead to suboptimal weight\nassignments and consequently, a suboptimal model.
To address these problems,\nthis paper proposes a novel min-max training framework, Class Optimal\nDistribution Adversarial Training (CODAT), which employs distributionally\nrobust optimization to fully explore the class-wise weight space, thus enabling\nthe identification of the optimal weight with theoretical guarantees.\nFurthermore, we derive a closed-form optimal solution to the internal\nmaximization and then get a deterministic equivalent objective function, which\nprovides a theoretical basis for the joint optimization of weights and model\nparameters. Meanwhile, we propose a fairness elasticity coefficient for the\nevaluation of the algorithm with regard to both robustness and robust fairness.\nExperimental results on various datasets show that the proposed method can\neffectively improve the robust fairness of the model and outperform the\nstate-of-the-art approaches.\n","authors":["Hongxin Zhi","Hongtao Yu","Shaome Li","Xiuming Zhao","Yiteng Wu"],"pdf_url":"https://arxiv.org/pdf/2501.04527v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18103v3","updated":"2025-01-08T14:13:23Z","published":"2024-03-26T21:01:41Z","title":"Tutorial on Diffusion Models for Imaging and Vision","summary":" The astonishing growth of generative tools in recent years has empowered many\nexciting applications in text-to-image generation and text-to-video generation.\nThe underlying principle behind these generative tools is the concept of\ndiffusion, a particular sampling mechanism that has overcome some shortcomings\nthat were deemed difficult in the previous approaches. The goal of this\ntutorial is to discuss the essential ideas underlying the diffusion models. The\ntarget audience of this tutorial includes undergraduate and graduate students\nwho are interested in doing research on diffusion models or applying these\nmodels to solve other problems.\n","authors":["Stanley H. 
Chan"],"pdf_url":"https://arxiv.org/pdf/2403.18103v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01587v2","updated":"2025-01-08T14:12:45Z","published":"2024-04-02T02:29:41Z","title":"TSCM: A Teacher-Student Model for Vision Place Recognition Using\n Cross-Metric Knowledge Distillation","summary":" Visual place recognition (VPR) plays a pivotal role in autonomous exploration\nand navigation of mobile robots within complex outdoor environments. While\ncost-effective and easily deployed, camera sensors are sensitive to lighting\nand weather changes, and even slight image alterations can greatly affect VPR\nefficiency and precision. Existing methods overcome this by exploiting powerful\nyet large networks, leading to significant consumption of computational\nresources. In this paper, we propose a high-performance teacher and lightweight\nstudent distillation framework called TSCM. It exploits our devised\ncross-metric knowledge distillation to narrow the performance gap between the\nteacher and student models, maintaining superior performance while enabling\nminimal computational load during deployment. We conduct comprehensive\nevaluations on large-scale datasets, namely Pittsburgh30k and Pittsburgh250k.\nExperimental results demonstrate the superiority of our method over baseline\nmodels in terms of recognition accuracy and model parameter efficiency.\nMoreover, our ablation studies show that the proposed knowledge distillation\ntechnique surpasses other counterparts. 
The code of our method has been\nreleased at https://github.com/nubot-nudt/TSCM.\n","authors":["Yehui Shen","Mingmin Liu","Huimin Lu","Xieyuanli Chen"],"pdf_url":"https://arxiv.org/pdf/2404.01587v2.pdf","comment":"Accepted to ICRA 2024"},{"id":"http://arxiv.org/abs/2501.04515v1","updated":"2025-01-08T14:05:24Z","published":"2025-01-08T14:05:24Z","title":"SplineFormer: An Explainable Transformer-Based Approach for Autonomous\n Endovascular Navigation","summary":" Endovascular navigation is a crucial aspect of minimally invasive procedures,\nwhere precise control of curvilinear instruments like guidewires is critical\nfor successful interventions. A key challenge in this task is accurately\npredicting the evolving shape of the guidewire as it navigates through the\nvasculature, which presents complex deformations due to interactions with the\nvessel walls. Traditional segmentation methods often fail to provide accurate\nreal-time shape predictions, limiting their effectiveness in highly dynamic\nenvironments. To address this, we propose SplineFormer, a new transformer-based\narchitecture, designed specifically to predict the continuous, smooth shape of\nthe guidewire in an explainable way. By leveraging the transformer's ability,\nour network effectively captures the intricate bending and twisting of the\nguidewire, representing it as a spline for greater accuracy and smoothness. We\nintegrate our SplineFormer into an end-to-end robot navigation system by\nleveraging the condensed information. The experimental results demonstrate that\nour SplineFormer is able to perform endovascular navigation autonomously and\nachieves a 50% success rate when cannulating the brachiocephalic artery on the\nreal robot.\n","authors":["Tudor Jianu","Shayan Doust","Mengyun Li","Baoru Huang","Tuong Do","Hoan Nguyen","Karl Bates","Tung D. 
Ta","Sebastiano Fichera","Pierre Berthet-Rayne","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2501.04515v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2501.04513v1","updated":"2025-01-08T14:00:07Z","published":"2025-01-08T14:00:07Z","title":"Improving Image Captioning by Mimicking Human Reformulation Feedback at\n Inference-time","summary":" Incorporating automatically predicted human feedback into the process of\ntraining generative models has attracted substantial recent interest, while\nfeedback at inference time has received less attention. The typical feedback at\ntraining time, i.e., preferences of choice given two samples, does not\nnaturally transfer to the inference phase. We introduce a novel type of\nfeedback -- caption reformulations -- and train models to mimic reformulation\nfeedback based on human annotations. Our method does not require training the\nimage captioning model itself, thereby demanding substantially less\ncomputational effort. We experiment with two types of reformulation feedback:\nfirst, we collect a dataset of human reformulations that correct errors in the\ngenerated captions. We find that incorporating reformulation models trained on\nthis data into the inference phase of existing image captioning models results\nin improved captions, especially when the original captions are of low quality.\nWe apply our method to non-English image captioning, a domain where robust\nmodels are less prevalent, and gain substantial improvement. Second, we apply\nreformulations to style transfer. 
Quantitative evaluations reveal\nstate-of-the-art performance on German image captioning and English style\ntransfer, while human validation with a detailed comparative framework exposes\nthe specific axes of improvement.\n","authors":["Uri Berger","Omri Abend","Lea Frermann","Gabriel Stanovsky"],"pdf_url":"https://arxiv.org/pdf/2501.04513v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06747v2","updated":"2025-01-08T13:49:54Z","published":"2024-08-13T09:10:48Z","title":"ReCLIP++: Learn to Rectify the Bias of CLIP for Unsupervised Semantic\n Segmentation","summary":" Recent works utilize CLIP to perform the challenging unsupervised semantic\nsegmentation task where only images without annotations are available. However,\nwe observe that when adopting CLIP to such a pixel-level understanding task,\nunexpected bias (including class-preference bias and space-preference bias)\noccurs. Previous works don't explicitly model the bias, which largely\nconstrains the segmentation performance. In this paper, we propose to\nexplicitly model and rectify the bias existing in CLIP to facilitate the\nunsupervised semantic segmentation task. Specifically, we design a learnable\n\"Reference\" prompt to encode class-preference bias and a projection of the\npositional embedding in the vision transformer to encode space-preference bias\nrespectively. To avoid interference, two kinds of biases are firstly\nindependently encoded into different features, i.e., the Reference feature and\nthe positional feature. Via a matrix multiplication between the Reference\nfeature and the positional feature, a bias logit map is generated to explicitly\nrepresent two kinds of biases. Then we rectify the logits of CLIP via a simple\nelement-wise subtraction. To make the rectified results smoother and more\ncontextual, we design a mask decoder which takes the feature of CLIP and the\nrectified logits as input and outputs a rectified segmentation mask with the\nhelp of Gumbel-Softmax operation. 
A contrastive loss based on the masked visual\nfeatures and the text features of different classes is imposed, which makes the\nbias modeling and rectification process meaningful and effective. Extensive\nexperiments on various benchmarks including PASCAL VOC, PASCAL Context, ADE20K,\nCityscapes, and COCO Stuff demonstrate that our method performs favorably\nagainst previous state-of-the-arts. The implementation is available at:\nhttps://github.com/dogehhh/ReCLIP.\n","authors":["Jingyun Wang","Guoliang Kang"],"pdf_url":"https://arxiv.org/pdf/2408.06747v2.pdf","comment":"Extended version of our CVPR 24 paper"},{"id":"http://arxiv.org/abs/2405.08766v2","updated":"2025-01-08T13:45:46Z","published":"2024-05-14T16:59:20Z","title":"Energy-based Hopfield Boosting for Out-of-Distribution Detection","summary":" Out-of-distribution (OOD) detection is critical when deploying machine\nlearning models in the real world. Outlier exposure methods, which incorporate\nauxiliary outlier data in the training process, can drastically improve OOD\ndetection performance compared to approaches without advanced training\nstrategies. We introduce Hopfield Boosting, a boosting approach, which\nleverages modern Hopfield energy (MHE) to sharpen the decision boundary between\nthe in-distribution and OOD data. Hopfield Boosting encourages the model to\nconcentrate on hard-to-distinguish auxiliary outlier examples that lie close to\nthe decision boundary between in-distribution and auxiliary outlier data. 
Our\nmethod achieves a new state-of-the-art in OOD detection with outlier exposure,\nimproving the FPR95 metric from 2.28 to 0.92 on CIFAR-10 and from 11.76 to 7.94\non CIFAR-100.\n","authors":["Claus Hofmann","Simon Schmid","Bernhard Lehner","Daniel Klotz","Sepp Hochreiter"],"pdf_url":"https://arxiv.org/pdf/2405.08766v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2311.15963v3","updated":"2025-01-08T13:45:15Z","published":"2023-11-27T16:07:34Z","title":"From Pixels to Titles: Video Game Identification by Screenshots using\n Convolutional Neural Networks","summary":" This paper investigates video game identification through single screenshots,\nutilizing ten convolutional neural network (CNN) architectures (VGG16,\nResNet50, ResNet152, MobileNet, DenseNet169, DenseNet201, EfficientNetB0,\nEfficientNetB2, EfficientNetB3, and EfficientNetV2S) and three transformers\narchitectures (ViT-B16, ViT-L32, and SwinT) across 22 home console systems,\nspanning from Atari 2600 to PlayStation 5, totalling 8,796 games and 170,881\nscreenshots. Except for VGG16, all CNNs outperformed the transformers in this\ntask. Using ImageNet pre-trained weights as initial weights, EfficientNetV2S\nachieves the highest average accuracy (77.44%) and the highest accuracy in 16\nof the 22 systems. DenseNet201 is the best in four systems and EfficientNetB3\nis the best in the remaining two systems. Employing alternative initial weights\nfine-tuned in an arcade screenshots dataset boosts accuracy for EfficientNet\narchitectures, with the EfficientNetV2S reaching a peak accuracy of 77.63% and\ndemonstrating reduced convergence epochs from 26.9 to 24.5 on average. Overall,\nthe combination of optimal architecture and weights attains 78.79% accuracy,\nprimarily led by EfficientNetV2S in 15 systems. 
These findings underscore the\nefficacy of CNNs in video game identification through screenshots.\n","authors":["Fabricio Breve"],"pdf_url":"https://arxiv.org/pdf/2311.15963v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.02270v2","updated":"2025-01-08T13:42:02Z","published":"2025-01-04T12:15:58Z","title":"Efficient Video-Based ALPR System Using YOLO and Visual Rhythm","summary":" Automatic License Plate Recognition (ALPR) involves extracting vehicle\nlicense plate information from an image or a video capture. These systems have\ngained popularity due to the wide availability of low-cost surveillance cameras\nand advances in Deep Learning. Typically, video-based ALPR systems rely on\nmultiple frames to detect the vehicle and recognize the license plates.\nTherefore, we propose a system capable of extracting exactly one frame per\nvehicle and recognizing its license plate characters from this singular image\nusing an Optical Character Recognition (OCR) model. Early experiments show that\nthis methodology is viable.\n","authors":["Victor Nascimento Ribeiro","Nina S. T. Hirata"],"pdf_url":"https://arxiv.org/pdf/2501.02270v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2412.17378v3","updated":"2025-01-08T13:31:11Z","published":"2024-12-23T08:26:30Z","title":"Balanced 3DGS: Gaussian-wise Parallelism Rendering with Fine-Grained\n Tiling","summary":" 3D Gaussian Splatting (3DGS) is increasingly attracting attention in both\nacademia and industry owing to its superior visual quality and rendering speed.\nHowever, training a 3DGS model remains a time-intensive task, especially in\nload imbalance scenarios where workload diversity among pixels and Gaussian\nspheres causes poor render CUDA kernel performance. We introduce Balanced 3DGS,\na Gaussian-wise parallelism rendering with fine-grained tiling approach in 3DGS\ntraining process, perfectly solving load-imbalance issues.
First, we\ninnovatively introduce the inter-block dynamic workload distribution technique\nto map workloads to Streaming Multiprocessor (SM) resources within a single GPU\ndynamically, which constitutes the foundation of load balancing. Second, we are\nthe first to propose the Gaussian-wise parallel rendering technique to\nsignificantly reduce workload divergence inside a warp, which serves as a\ncritical component in addressing load imbalance. Based on the above two\nmethods, we further creatively put forward the fine-grained combined load\nbalancing technique to uniformly distribute workload across all SMs, which\nboosts the forward render CUDA kernel performance by up to 7.52x. Besides, we\npresent a self-adaptive render kernel selection strategy during the 3DGS\ntraining process based on different load-balance situations, which effectively\nimproves training efficiency.\n","authors":["Hao Gui","Lin Hu","Rui Chen","Mingxiao Huang","Yuxin Yin","Jin Yang","Yong Wu","Chen Liu","Zhongxu Sun","Xueyang Zhang","Kun Zhan"],"pdf_url":"https://arxiv.org/pdf/2412.17378v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04493v1","updated":"2025-01-08T13:26:24Z","published":"2025-01-08T13:26:24Z","title":"The Role of Machine Learning in Congenital Heart Disease Diagnosis:\n Datasets, Algorithms, and Insights","summary":" Congenital heart disease is among the most common fetal abnormalities and\nbirth defects. Despite identifying numerous risk factors influencing its onset,\na comprehensive understanding of its genesis and management across diverse\npopulations remains limited. Recent advancements in machine learning have\ndemonstrated the potential for leveraging patient data to enable early\ncongenital heart disease detection. Over the past seven years, researchers have\nproposed various data-driven and algorithmic solutions to address this\nchallenge.
This paper presents a systematic review of congenital heart disease\nrecognition using machine learning, conducting a meta-analysis of 432\nreferences from leading journals published between 2018 and 2024. A detailed\ninvestigation of 74 scholarly works highlights key factors, including\ndatabases, algorithms, applications, and solutions. Additionally, the survey\noutlines reported datasets used by machine learning experts for congenital\nheart disease recognition. Using a systematic literature review methodology,\nthis study identifies critical challenges and opportunities in applying machine\nlearning to congenital heart disease.\n","authors":["Khalil Khan","Farhan Ullah","Ikram Syed","Irfan Ullah"],"pdf_url":"https://arxiv.org/pdf/2501.04493v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04486v1","updated":"2025-01-08T13:13:52Z","published":"2025-01-08T13:13:52Z","title":"MB-TaylorFormer V2: Improved Multi-branch Linear Transformer Expanded by\n Taylor Formula for Image Restoration","summary":" Recently, Transformer networks have demonstrated outstanding performance in\nthe field of image restoration due to the global receptive field and\nadaptability to input. However, the quadratic computational complexity of\nSoftmax-attention poses a significant limitation on its extensive application\nin image restoration tasks, particularly for high-resolution images. To tackle\nthis challenge, we propose a novel variant of the Transformer. This variant\nleverages the Taylor expansion to approximate the Softmax-attention and\nutilizes the concept of norm-preserving mapping to approximate the remainder of\nthe first-order Taylor expansion, resulting in a linear computational\ncomplexity.
Moreover, we introduce a multi-branch architecture featuring\nmulti-scale patch embedding into the proposed Transformer, which has four\ndistinct advantages: 1) various sizes of the receptive field; 2) multi-level\nsemantic information; 3) flexible shapes of the receptive field; 4) accelerated\ntraining and inference speed. Hence, the proposed model, named the second\nversion of Taylor formula expansion-based Transformer (for short\nMB-TaylorFormer V2) has the capability to concurrently process coarse-to-fine\nfeatures, capture long-distance pixel interactions with limited computational\ncost, and improve the approximation of the Taylor expansion remainder.\nExperimental results across diverse image restoration benchmarks demonstrate\nthat MB-TaylorFormer V2 achieves state-of-the-art performance in multiple image\nrestoration tasks, such as image dehazing, deraining, desnowing, motion\ndeblurring, and denoising, with very little computational overhead. The source\ncode is available at https://github.com/FVL2020/MB-TaylorFormerV2.\n","authors":["Zhi Jin","Yuwei Qiu","Kaihao Zhang","Hongdong Li","Wenhan Luo"],"pdf_url":"https://arxiv.org/pdf/2501.04486v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01996v4","updated":"2025-01-08T13:03:24Z","published":"2024-07-02T07:10:10Z","title":"ViG-Bias: Visually Grounded Bias Discovery and Mitigation","summary":" The proliferation of machine learning models in critical decision making\nprocesses has underscored the need for bias discovery and mitigation\nstrategies. Identifying the reasons behind a biased system is not\nstraightforward, since in many occasions they are associated with hidden\nspurious correlations which are not easy to spot. 
Standard approaches rely on\nbias audits performed by analyzing model performance in pre-defined subgroups\nof data samples, usually characterized by common attributes like gender or\nethnicity when it comes to people, or other specific attributes defining\nsemantically coherent groups of images. However, it is not always possible to\nknow a-priori the specific attributes defining the failure modes of visual\nrecognition systems. Recent approaches propose to discover these groups by\nleveraging large vision language models, which enable the extraction of\ncross-modal embeddings and the generation of textual descriptions to\ncharacterize the subgroups where a certain model is underperforming. In this\nwork, we argue that incorporating visual explanations (e.g. heatmaps generated\nvia GradCAM or other approaches) can boost the performance of such bias\ndiscovery and mitigation frameworks. To this end, we introduce Visually\nGrounded Bias Discovery and Mitigation (ViG-Bias), a simple yet effective\ntechnique which can be integrated to a variety of existing frameworks to\nimprove both, discovery and mitigation performance. Our comprehensive\nevaluation shows that incorporating visual explanations enhances existing\ntechniques like DOMINO, FACTS and Bias-to-Text, across several challenging\ndatasets, including CelebA, Waterbirds, and NICO++.\n","authors":["Badr-Eddine Marani","Mohamed Hanini","Nihitha Malayarukil","Stergios Christodoulidis","Maria Vakalopoulou","Enzo Ferrante"],"pdf_url":"https://arxiv.org/pdf/2407.01996v4.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2501.04477v1","updated":"2025-01-08T13:00:17Z","published":"2025-01-08T13:00:17Z","title":"Rethinking High-speed Image Reconstruction Framework with Spike Camera","summary":" Spike cameras, as innovative neuromorphic devices, generate continuous spike\nstreams to capture high-speed scenes with lower bandwidth and higher dynamic\nrange than traditional RGB cameras. 
However, reconstructing high-quality images\nfrom the spike input under low-light conditions remains challenging.\nConventional learning-based methods often rely on the synthetic dataset as the\nsupervision for training. Still, these approaches falter when dealing with\nnoisy spikes fired under the low-light environment, leading to further\nperformance degradation in the real-world dataset. This phenomenon is primarily\ndue to inadequate noise modelling and the domain gap between synthetic and real\ndatasets, resulting in recovered images with unclear textures, excessive noise,\nand diminished brightness. To address these challenges, we introduce a novel\nspike-to-image reconstruction framework SpikeCLIP that goes beyond traditional\ntraining paradigms. Leveraging the CLIP model's powerful capability to align\ntext and images, we incorporate the textual description of the captured scene\nand unpaired high-quality datasets as the supervision. Our experiments on\nreal-world low-light datasets U-CALTECH and U-CIFAR demonstrate that SpikeCLIP\nsignificantly enhances texture details and the luminance balance of recovered\nimages. Furthermore, the reconstructed images are well-aligned with the broader\nvisual features needed for downstream tasks, ensuring more robust and versatile\nperformance in challenging environments.\n","authors":["Kang Chen","Yajing Zheng","Tiejun Huang","Zhaofei Yu"],"pdf_url":"https://arxiv.org/pdf/2501.04477v1.pdf","comment":"Accepted by AAAI2025"},{"id":"http://arxiv.org/abs/2501.04467v1","updated":"2025-01-08T12:41:42Z","published":"2025-01-08T12:41:42Z","title":"A Histologic Dataset of Normal and Atypical Mitotic Figures on Human\n Breast Cancer (AMi-Br)","summary":" Assessment of the density of mitotic figures (MFs) in histologic tumor\nsections is an important prognostic marker for many tumor types, including\nbreast cancer. 
Recently, it has been reported in multiple works that the\nquantity of MFs with an atypical morphology (atypical MFs, AMFs) might be an\nindependent prognostic criterion for breast cancer. AMFs are an indicator of\nmutations in the genes regulating the cell cycle and can lead to aberrant\nchromosome constitution (aneuploidy) of the tumor cells. To facilitate further\nresearch on this topic using pattern recognition, we present the first ever\npublicly available dataset of atypical and normal MFs (AMi-Br). For this, we\nutilized two of the most popular MF datasets (MIDOG 2021 and TUPAC) and\nsubclassified all MFs using a three expert majority vote. Our final dataset\nconsists of 3,720 MFs, split into 832 AMFs (22.4%) and 2,888 normal MFs (77.6%)\nacross all 223 tumor cases in the combined set. We provide baseline\nclassification experiments to investigate the consistency of the dataset, using\na Monte Carlo cross-validation and different strategies to combat class\nimbalance. We found an averaged balanced accuracy of up to 0.806 when using a\npatch-level data set split, and up to 0.713 when using a patient-level split.\n","authors":["Christof A. Bertram","Viktoria Weiss","Taryn A. Donovan","Sweta Banerjee","Thomas Conrad","Jonas Ammeling","Robert Klopfleisch","Christopher Kaltenecker","Marc Aubreville"],"pdf_url":"https://arxiv.org/pdf/2501.04467v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04459v1","updated":"2025-01-08T12:30:06Z","published":"2025-01-08T12:30:06Z","title":"Rapid Automated Mapping of Clouds on Titan With Instance Segmentation","summary":" Despite widespread adoption of deep learning models to address a variety of\ncomputer vision tasks, planetary science has yet to see extensive utilization\nof such tools to address its unique problems. 
On Titan, the largest moon of\nSaturn, tracking seasonal trends and weather patterns of clouds provides\ncrucial insights into one of the most complex climates in the Solar System, yet\nmuch of the available image data are still analyzed in a conventional way. In\nthis work, we apply a Mask R-CNN trained via transfer learning to perform\ninstance segmentation of clouds in Titan images acquired by the Cassini\nspacecraft - a previously unexplored approach to a big data problem in\nplanetary science. We demonstrate that an automated technique can provide\nquantitative measures for clouds, such as areas and centroids, that may\notherwise be prohibitively time-intensive to produce by human mapping.\nFurthermore, despite Titan specific challenges, our approach yields accuracy\ncomparable to contemporary cloud identification studies on Earth and other\nworlds. We compare the efficiencies of human-driven versus algorithmic\napproaches, showing that transfer learning provides speed-ups that may open new\nhorizons for data investigation for Titan. Moreover, we suggest that such\napproaches have broad potential for application to similar problems in\nplanetary science where they are currently under-utilized. 
Future planned\nmissions to the planets and remote sensing initiatives for the Earth promise to\nprovide a deluge of image data in the coming years that will benefit strongly\nfrom leveraging machine learning approaches to perform the analysis.\n","authors":["Zachary Yahn","Douglas M Trent","Ethan Duncan","Benoît Seignovert","John Santerre","Conor Nixon"],"pdf_url":"https://arxiv.org/pdf/2501.04459v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14599v2","updated":"2025-01-08T12:20:56Z","published":"2024-06-20T17:59:56Z","title":"Stylebreeder: Exploring and Democratizing Artistic Styles through\n Text-to-Image Models","summary":" Text-to-image models are becoming increasingly popular, revolutionizing the\nlandscape of digital art creation by enabling highly detailed and creative\nvisual content generation. These models have been widely employed across\nvarious domains, particularly in art generation, where they facilitate a broad\nspectrum of creative expression and democratize access to artistic creation. In\nthis paper, we introduce \\texttt{STYLEBREEDER}, a comprehensive dataset of 6.8M\nimages and 1.8M prompts generated by 95K users on Artbreeder, a platform that\nhas emerged as a significant hub for creative exploration with over 13M users.\nWe introduce a series of tasks with this dataset aimed at identifying diverse\nartistic styles, generating personalized content, and recommending styles based\non user interests. By documenting unique, user-generated styles that transcend\nconventional categories like 'cyberpunk' or 'Picasso,' we explore the potential\nfor unique, crowd-sourced styles that could provide deep insights into the\ncollective creative psyche of users worldwide. We also evaluate different\npersonalization methods to enhance artistic expression and introduce a style\natlas, making these models available in LoRA format for public use. 
Our\nresearch demonstrates the potential of text-to-image diffusion models to\nuncover and promote unique artistic expressions, further democratizing AI in\nart and fostering a more diverse and inclusive artistic community. The dataset,\ncode and models are available at https://stylebreeder.github.io under a Public\nDomain (CC0) license.\n","authors":["Matthew Zheng","Enis Simsar","Hidir Yesiltepe","Federico Tombari","Joel Simon","Pinar Yanardag"],"pdf_url":"https://arxiv.org/pdf/2406.14599v2.pdf","comment":"Accepted at NeurIPS 2024 D&B Track, Project page:\n https://stylebreeder.github.io HuggingFace DB Page:\n https://huggingface.co/datasets/stylebreeder/stylebreeder"},{"id":"http://arxiv.org/abs/2501.01767v2","updated":"2025-01-08T12:11:18Z","published":"2025-01-03T11:40:41Z","title":"LogicAD: Explainable Anomaly Detection via VLM-based Text Feature\n Extraction","summary":" Logical image understanding involves interpreting and reasoning about the\nrelationships and consistency within an image's visual content. This capability\nis essential in applications such as industrial inspection, where logical\nanomaly detection is critical for maintaining high-quality standards and\nminimizing costly recalls. Previous research in anomaly detection (AD) has\nrelied on prior knowledge for designing algorithms, which often requires\nextensive manual annotations, significant computing power, and large amounts of\ndata for training. Autoregressive, multimodal Vision Language Models (AVLMs)\noffer a promising alternative due to their exceptional performance in visual\nreasoning across various domains. Despite this, their application to logical AD\nremains unexplored. In this work, we investigate using AVLMs for logical AD and\ndemonstrate that they are well-suited to the task. 
Combining AVLMs with format\nembedding and a logic reasoner, we achieve SOTA performance on public\nbenchmarks, MVTec LOCO AD, with an AUROC of 86.0% and F1-max of 83.7%, along\nwith explanations of anomalies. This significantly outperforms the existing\nSOTA method by a large margin.\n","authors":["Er Jin","Qihui Feng","Yongli Mou","Stefan Decker","Gerhard Lakemeyer","Oliver Simons","Johannes Stegmaier"],"pdf_url":"https://arxiv.org/pdf/2501.01767v2.pdf","comment":"Accepted for publication at aaai25, project page:\n https://jasonjin34.github.io/logicad.github.io/"},{"id":"http://arxiv.org/abs/2501.04444v1","updated":"2025-01-08T11:53:30Z","published":"2025-01-08T11:53:30Z","title":"A novel Facial Recognition technique with Focusing on Masked Faces","summary":" Recognizing the same faces with and without masks is important for ensuring\nconsistent identification in security, access control, and public safety. This\ncapability is crucial in scenarios like law enforcement, healthcare, and\nsurveillance, where accurate recognition must be maintained despite facial\nocclusion. This research focuses on the challenge of recognizing the same faces\nwith and without masks by employing cosine similarity as the primary technique.\nWith the increased use of masks, traditional facial recognition systems face\nsignificant accuracy issues, making it crucial to develop methods that can\nreliably identify individuals in masked conditions. For that reason, this study\nproposed Masked-Unmasked Face Matching Model (MUFM). This model employs\ntransfer learning using the Visual Geometry Group (VGG16) model to extract\nsignificant facial features, which are subsequently classified utilizing the\nK-Nearest Neighbors (K-NN) algorithm. The cosine similarity metric is employed\nto compare masked and unmasked faces of the same individuals. 
This approach\nrepresents a novel contribution, as the task of recognizing the same individual\nwith and without a mask using cosine similarity has not been previously\naddressed. By integrating these advanced methodologies, the research\ndemonstrates effective identification of individuals despite the presence of\nmasks, addressing a significant limitation in traditional systems. Data\npreparation is another essential part of this work: an image dataset was\ncollected and prepared from three different sources, some of which contain\nreal-world images, giving the research comprehensive coverage. The images were\ndrawn from three existing datasets containing masked and unmasked versions of\nthe same faces.\n","authors":["Dana A Abdullah","Dana Rasul Hamad","Hakem Beitollahi","Ismail Y Maolood","Abdulhady Abas Abdullah","Aso Khaleel Ameen"],"pdf_url":"https://arxiv.org/pdf/2501.04444v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04440v1","updated":"2025-01-08T11:41:47Z","published":"2025-01-08T11:41:47Z","title":"RSAR: Restricted State Angle Resolver and Rotated SAR Benchmark","summary":" Rotated object detection has made significant progress in optical remote\nsensing. However, advancements in the Synthetic Aperture Radar (SAR) field\nlag behind, primarily due to the absence of a large-scale dataset.\nAnnotating such a dataset is inefficient and costly. A promising solution is to\nemploy a weakly supervised model (e.g., trained with available horizontal boxes\nonly) to generate pseudo-rotated boxes for reference before manual calibration.\nUnfortunately, existing weakly supervised models exhibit limited accuracy\nin predicting the object's angle. Previous works attempt to enhance angle\nprediction by using angle resolvers that decouple angles into cosine and sine\nencodings. 
In this work, we first reevaluate these resolvers from a unified\nperspective of dimension mapping and expose that they share the same\nshortcomings: these methods overlook the unit cycle constraint inherent in\nthese encodings, easily leading to prediction biases. To address this issue, we\npropose the Unit Cycle Resolver, which incorporates a unit circle constraint\nloss to improve angle prediction accuracy. Our approach can effectively improve\nthe performance of existing state-of-the-art weakly supervised methods and even\nsurpasses fully supervised models on existing optical benchmarks (i.e.,\nDOTA-v1.0 dataset). With the aid of UCR, we further annotate and introduce\nRSAR, the largest multi-class rotated SAR object detection dataset to date.\nExtensive experiments on both RSAR and optical datasets demonstrate that our\nUCR enhances angle prediction accuracy. Our dataset and code can be found at:\nhttps://github.com/zhasion/RSAR.\n","authors":["Xin Zhang","Xue Yang","Yuxuan Li","Jian Yang","Ming-Ming Cheng","Xiang Li"],"pdf_url":"https://arxiv.org/pdf/2501.04440v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.01087v3","updated":"2025-01-08T11:40:29Z","published":"2025-01-02T06:19:53Z","title":"Bridging Simplicity and Sophistication using GLinear: A Novel\n Architecture for Enhanced Time Series Prediction","summary":" Time Series Forecasting (TSF) is an important application across many fields.\nThere is a debate about whether Transformers, despite being good at\nunderstanding long sequences, struggle with preserving temporal relationships\nin time series data. Recent research suggests that simpler linear models might\noutperform or at least provide competitive performance compared to complex\nTransformer-based models for TSF tasks. In this paper, we propose a novel\ndata-efficient architecture, GLinear, for multivariate TSF that exploits\nperiodic patterns to provide better accuracy. 
It also provides better\nprediction accuracy by using a smaller amount of historical data compared to\nother state-of-the-art linear predictors. Four different datasets (ETTh1,\nElectricity, Traffic, and Weather) are used to evaluate the performance of the\nproposed predictor. A performance comparison with state-of-the-art linear\narchitectures (such as NLinear, DLinear, and RLinear) and transformer-based\ntime series predictor (Autoformer) shows that the GLinear, despite being\nparametrically efficient, significantly outperforms the existing architectures\nin most cases of multivariate TSF. We hope that the proposed GLinear opens new\nfronts of research and development of simpler and more sophisticated\narchitectures for data and computationally efficient time-series analysis.\n","authors":["Syed Tahir Hussain Rizvi","Neel Kanwal","Muddasar Naeem","Alfredo Cuzzocrea","Antonio Coronato"],"pdf_url":"https://arxiv.org/pdf/2501.01087v3.pdf","comment":"Submitted to IEEE Transactions on Emerging Topics in Computational\n Intelligence"},{"id":"http://arxiv.org/abs/2501.03567v2","updated":"2025-01-08T11:20:00Z","published":"2025-01-07T06:35:34Z","title":"Evaluating Image Caption via Cycle-consistent Text-to-Image Generation","summary":" Evaluating image captions typically relies on reference captions, which are\ncostly to obtain and exhibit significant diversity and subjectivity. While\nreference-free evaluation metrics have been proposed, most focus on cross-modal\nevaluation between captions and images. Recent research has revealed that the\nmodality gap generally exists in the representation of contrastive\nlearning-based multi-modal systems, undermining the reliability of\ncross-modality metrics like CLIPScore. 
In this paper, we propose CAMScore, a\ncyclic reference-free automatic evaluation metric for image captioning models.\nTo circumvent the aforementioned modality gap, CAMScore utilizes a\ntext-to-image model to generate images from captions and subsequently evaluates\nthese generated images against the original images. Furthermore, to provide\nfine-grained information for a more comprehensive evaluation, we design a\nthree-level evaluation framework for CAMScore that encompasses pixel-level,\nsemantic-level, and objective-level perspectives. Extensive experiment results\nacross multiple benchmark datasets show that CAMScore achieves a superior\ncorrelation with human judgments compared to existing reference-based and\nreference-free metrics, demonstrating the effectiveness of the framework.\n","authors":["Tianyu Cui","Jinbin Bai","Guo-Hua Wang","Qing-Guo Chen","Zhao Xu","Weihua Luo","Kaifu Zhang","Ye Shi"],"pdf_url":"https://arxiv.org/pdf/2501.03567v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14520v4","updated":"2025-01-08T11:03:00Z","published":"2024-03-21T16:17:57Z","title":"Cobra: Extending Mamba to Multi-Modal Large Language Model for Efficient\n Inference","summary":" In recent years, the application of multimodal large language models (MLLM)\nin various fields has achieved remarkable success. However, as the foundation\nmodel for many downstream tasks, current MLLMs are composed of the well-known\nTransformer network, which has a less efficient quadratic computation\ncomplexity. To improve the efficiency of such basic models, we propose Cobra, a\nlinear computational complexity MLLM. Specifically, Cobra integrates the\nefficient Mamba language model into the visual modality. Moreover, we explore\nand study various modal fusion schemes to create an effective multi-modal\nMamba. 
Extensive experiments demonstrate that (1) Cobra achieves extremely\ncompetitive performance with current computationally efficient state-of-the-art\nmethods, e.g., LLaVA-Phi, TinyLLaVA, and MobileVLM v2, and has faster speed due\nto Cobra's linear sequential modeling. (2) Interestingly, the results of\nclosed-set challenging prediction benchmarks show that Cobra performs well in\novercoming visual illusions and spatial relationship judgments. (3) Notably,\nCobra even achieves comparable performance to LLaVA with about 43% of the\nnumber of parameters. We will make all codes of Cobra open-source and hope that\nthe proposed method can facilitate future research on complexity problems in\nMLLM. Our project page is available at: https://sites.google.com/view/cobravlm.\n","authors":["Han Zhao","Min Zhang","Wei Zhao","Pengxiang Ding","Siteng Huang","Donglin Wang"],"pdf_url":"https://arxiv.org/pdf/2403.14520v4.pdf","comment":"Accepted to the Thirty-Ninth AAAI Conference on Artificial\n Intelligence (AAAI-25)"},{"id":"http://arxiv.org/abs/2409.09502v2","updated":"2025-01-08T10:11:44Z","published":"2024-09-14T18:26:26Z","title":"One missing piece in Vision and Language: A Survey on Comics\n Understanding","summary":" Vision-language models have recently evolved into versatile systems capable\nof high performance across a range of tasks, such as document understanding,\nvisual question answering, and grounding, often in zero-shot settings. Comics\nUnderstanding, a complex and multifaceted field, stands to greatly benefit from\nthese advances. Comics, as a medium, combine rich visual and textual\nnarratives, challenging AI models with tasks that span image classification,\nobject detection, instance segmentation, and deeper narrative comprehension\nthrough sequential panels. 
However, the unique structure of comics --\ncharacterized by creative variations in style, reading order, and non-linear\nstorytelling -- presents a set of challenges distinct from those in other\nvisual-language domains. In this survey, we present a comprehensive review of\nComics Understanding from both dataset and task perspectives. Our contributions\nare fivefold: (1) We analyze the structure of the comics medium, detailing its\ndistinctive compositional elements; (2) We survey the widely used datasets and\ntasks in comics research, emphasizing their role in advancing the field; (3) We\nintroduce the Layer of Comics Understanding (LoCU) framework, a novel taxonomy\nthat redefines vision-language tasks within comics and lays the foundation for\nfuture work; (4) We provide a detailed review and categorization of existing\nmethods following the LoCU framework; (5) Finally, we highlight current\nresearch challenges and propose directions for future exploration, particularly\nin the context of vision-language models applied to comics. This survey is the\nfirst to propose a task-oriented framework for comics intelligence and aims to\nguide future research by addressing critical gaps in data availability and task\ndefinition. A project associated with this survey is available at\nhttps://github.com/emanuelevivoli/awesome-comics-understanding.\n","authors":["Emanuele Vivoli","Mohamed Ali Souibgui","Andrey Barsky","Artemis LLabrés","Marco Bertini","Dimosthenis Karatzas"],"pdf_url":"https://arxiv.org/pdf/2409.09502v2.pdf","comment":"under review. project website:\n https://github.com/emanuelevivoli/awesome-comics-understanding"},{"id":"http://arxiv.org/abs/2501.04390v1","updated":"2025-01-08T10:08:09Z","published":"2025-01-08T10:08:09Z","title":"iFADIT: Invertible Face Anonymization via Disentangled Identity\n Transform","summary":" Face anonymization aims to conceal the visual identity of a face to safeguard\nthe individual's privacy. 
Traditional methods like blurring and pixelation can\nlargely remove identifying features, but these techniques significantly degrade\nimage quality and are vulnerable to deep reconstruction attacks. Generative\nmodels have emerged as a promising solution for anonymizing faces while\npreserving a natural appearance. However, many still face limitations in visual\nquality and often overlook the potential to recover the original face from the\nanonymized version, which can be valuable in specific contexts such as image\nforensics. This paper proposes a novel framework named iFADIT, an acronym for\nInvertible Face Anonymization via Disentangled Identity Transform. The framework\nfeatures a disentanglement architecture coupled with a secure flow-based model:\nthe former decouples identity information from non-identifying attributes,\nwhile the latter transforms the decoupled identity into an anonymized version\nin an invertible manner controlled by a secret key. The anonymized face can\nthen be reconstructed based on a pre-trained StyleGAN that ensures high image\nquality and realistic facial details. 
Recovery of the original face (aka\nde-anonymization) is possible upon the availability of the matching secret, by\ninverting the anonymization process based on the same set of model parameters.\nFurthermore, a dedicated secret-key mechanism along with a dual-phase training\nstrategy is devised to ensure the desired properties of face anonymization.\nQualitative and quantitative experiments demonstrate the superiority of the\nproposed approach in anonymity, reversibility, security, diversity, and\ninterpretability over competing methods.\n","authors":["Lin Yuan","Kai Liang","Xiong Li","Tao Wu","Nannan Wang","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2501.04390v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13789v3","updated":"2025-01-08T10:01:20Z","published":"2023-12-21T12:26:11Z","title":"TinySAM: Pushing the Envelope for Efficient Segment Anything Model","summary":" Recently segment anything model (SAM) has shown powerful segmentation\ncapability and has drawn great attention in computer vision fields. Massive\nfollowing works have developed various applications based on the pre-trained\nSAM and achieved impressive performance on downstream vision tasks. However,\nSAM consists of heavy architectures and requires massive computational\ncapacity, which hinders the further application of SAM on computation\nconstrained edge devices. To this end, in this paper we propose a framework to\nobtain a tiny segment anything model (TinySAM) while maintaining the strong\nzero-shot performance. We first propose a full-stage knowledge distillation\nmethod with hard prompt sampling and hard mask weighting strategy to distill a\nlightweight student model. We also adapt the post-training quantization to the\nprompt-based segmentation task and further reduce the computational cost.\nMoreover, a hierarchical segmenting everything strategy is proposed to\naccelerate the everything inference by $2\\times$ with almost no performance\ndegradation. 
With all these proposed methods, our TinySAM leads to orders of\nmagnitude computational reduction and pushes the envelope for efficient segment\nanything task. Extensive experiments on various zero-shot transfer tasks\ndemonstrate the significantly advantageous performance of our TinySAM against\ncounterpart methods. Codes are available at\nhttps://github.com/xinghaochen/TinySAM and\nhttps://gitee.com/mindspore/models/tree/master/research/cv/TinySAM.\n","authors":["Han Shu","Wenshuo Li","Yehui Tang","Yiman Zhang","Yihao Chen","Houqiang Li","Yunhe Wang","Xinghao Chen"],"pdf_url":"https://arxiv.org/pdf/2312.13789v3.pdf","comment":"AAAI 2025"},{"id":"http://arxiv.org/abs/2312.00947v3","updated":"2025-01-08T09:41:18Z","published":"2023-12-01T22:00:14Z","title":"FreeZe: Training-free zero-shot 6D pose estimation with geometric and\n vision foundation models","summary":" Estimating the 6D pose of objects unseen during training is highly desirable\nyet challenging. Zero-shot object 6D pose estimation methods address this\nchallenge by leveraging additional task-specific supervision provided by\nlarge-scale, photo-realistic synthetic datasets. However, their performance\nheavily depends on the quality and diversity of rendered data and they require\nextensive training. In this work, we show how to tackle the same task but\nwithout training on specific data. We propose FreeZe, a novel solution that\nharnesses the capabilities of pre-trained geometric and vision foundation\nmodels. FreeZe leverages 3D geometric descriptors learned from unrelated 3D\npoint clouds and 2D visual features learned from web-scale 2D images to\ngenerate discriminative 3D point-level descriptors. We then estimate the 6D\npose of unseen objects by 3D registration based on RANSAC. We also introduce a\nnovel algorithm to solve ambiguous cases due to geometrically symmetric objects\nthat is based on visual features. 
We comprehensively evaluate FreeZe across the\nseven core datasets of the BOP Benchmark, which include over a hundred 3D\nobjects and 20,000 images captured in various scenarios. FreeZe consistently\noutperforms all state-of-the-art approaches, including competitors extensively\ntrained on synthetic 6D pose estimation data. Code will be publicly available\nat https://andreacaraffa.github.io/freeze.\n","authors":["Andrea Caraffa","Davide Boscaini","Amir Hamza","Fabio Poiesi"],"pdf_url":"https://arxiv.org/pdf/2312.00947v3.pdf","comment":"Accepted to ECCV 2024. Project page:\n https://andreacaraffa.github.io/freeze"},{"id":"http://arxiv.org/abs/2309.06941v3","updated":"2025-01-08T09:35:58Z","published":"2023-09-13T13:24:27Z","title":"DEFormer: DCT-driven Enhancement Transformer for Low-light Image and\n Dark Vision","summary":" Low-light image enhancement restores the colors and details of a single image\nand improves high-level visual tasks. However, restoring the details lost in\ndark areas remains a challenge when relying only on the RGB domain. In this\npaper, we introduce frequency as a new clue for the model and propose a\nDCT-driven enhancement transformer (DEFormer) framework. First, we propose a\nlearnable frequency branch (LFB) for frequency enhancement, which contains DCT\nprocessing and curvature-based frequency enhancement (CFE) to represent\nfrequency features. Additionally, we propose a cross-domain fusion (CDF) to\nreduce the differences between the RGB domain and the frequency domain. 
Our\nDEFormer has achieved superior results on the LOL and MIT-Adobe FiveK datasets,\nimproving the dark detection performance.\n","authors":["Xiangchen Yin","Zhenda Yu","Xin Gao","Xiao Sun"],"pdf_url":"https://arxiv.org/pdf/2309.06941v3.pdf","comment":"Accepted by ICASSP"},{"id":"http://arxiv.org/abs/2501.04377v1","updated":"2025-01-08T09:34:15Z","published":"2025-01-08T09:34:15Z","title":"On Computational Limits and Provably Efficient Criteria of Visual\n Autoregressive Models: A Fine-Grained Complexity Analysis","summary":" Recently, Visual Autoregressive ($\\mathsf{VAR}$) Models introduced a\ngroundbreaking advancement in the field of image generation, offering a\nscalable approach through a coarse-to-fine \"next-scale prediction\" paradigm.\nHowever, the state-of-the-art algorithm of $\\mathsf{VAR}$ models in [Tian,\nJiang, Yuan, Peng and Wang, NeurIPS 2024] takes $O(n^4)$ time, which is\ncomputationally inefficient. In this work, we analyze the computational limits\nand efficiency criteria of $\\mathsf{VAR}$ Models through a fine-grained\ncomplexity lens. Our key contribution is identifying the conditions under which\n$\\mathsf{VAR}$ computations can achieve sub-quadratic time complexity.\nSpecifically, we establish a critical threshold for the norm of input matrices\nused in $\\mathsf{VAR}$ attention mechanisms. Above this threshold, assuming the\nStrong Exponential Time Hypothesis ($\\mathsf{SETH}$) from fine-grained\ncomplexity theory, a sub-quartic time algorithm for $\\mathsf{VAR}$ models is\nimpossible. To substantiate our theoretical findings, we present efficient\nconstructions leveraging low-rank approximations that align with the derived\ncriteria. This work initiates the study of the computational efficiency of the\n$\\mathsf{VAR}$ model from a theoretical perspective. 
Our technique will shed\nlight on advancing scalable and efficient image generation in $\mathsf{VAR}$\nframeworks.\n","authors":["Yekun Ke","Xiaoyu Li","Yingyu Liang","Zhizhou Sha","Zhenmei Shi","Zhao Song"],"pdf_url":"https://arxiv.org/pdf/2501.04377v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04376v1","updated":"2025-01-08T09:30:45Z","published":"2025-01-08T09:30:45Z","title":"Exploring Unbiased Deepfake Detection via Token-Level Shuffling and\n Mixing","summary":" The generalization problem is broadly recognized as a critical challenge in\ndetecting deepfakes. Most previous work believes that the generalization gap is\ncaused by the differences among various forgery methods. However, our\ninvestigation reveals that the generalization issue can still occur when\nforgery-irrelevant factors shift. In this work, we identify two biases that\ndetectors are also prone to overfitting: position bias and content bias, as\ndepicted in Fig. 1. For position bias, we observe that detectors tend to\nlazily depend on specific positions within an image (e.g., central regions,\neven when they contain no forgery). As for content bias, we argue that\ndetectors may potentially and mistakenly utilize forgery-unrelated information\nfor detection (e.g., background and hair). To intervene on these biases, we\npropose two branches for shuffling and mixing tokens in the latent space of\ntransformers. For the shuffling branch, we rearrange the tokens and the\ncorresponding position embeddings for each image while maintaining the local\ncorrelation. For the mixing branch, we randomly select and mix the tokens in\nthe latent space between two images with the same label within the mini-batch\nto recombine the content information. During the learning process, we align the\noutputs of detectors from different branches in both feature space and logit\nspace. 
Contrastive losses for features and divergence losses for logits are\napplied to obtain unbiased feature representation and classifiers. We\ndemonstrate and verify the effectiveness of our method through extensive\nexperiments on widely used evaluation datasets.\n","authors":["Xinghe Fu","Zhiyuan Yan","Taiping Yao","Shen Chen","Xi Li"],"pdf_url":"https://arxiv.org/pdf/2501.04376v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15209v3","updated":"2025-01-08T09:29:10Z","published":"2024-03-22T13:50:27Z","title":"MSCoTDet: Language-driven Multi-modal Fusion for Improved Multispectral\n Pedestrian Detection","summary":" Multispectral pedestrian detection is attractive for around-the-clock\napplications due to the complementary information between RGB and thermal\nmodalities. However, current models often fail to detect pedestrians in certain\ncases (e.g., thermal-obscured pedestrians), particularly due to the modality\nbias learned from statistically biased datasets. In this paper, we investigate\nhow to mitigate modality bias in multispectral pedestrian detection using Large\nLanguage Models (LLMs). Accordingly, we design a Multispectral Chain-of-Thought\n(MSCoT) prompting strategy, which prompts the LLM to perform multispectral\npedestrian detection. Moreover, we propose a novel Multispectral\nChain-of-Thought Detection (MSCoTDet) framework that integrates MSCoT prompting\ninto multispectral pedestrian detection. To this end, we design a\nLanguage-driven Multi-modal Fusion (LMF) strategy that enables fusing the\noutputs of MSCoT prompting with the detection results of vision-based\nmultispectral pedestrian detection models. 
Extensive experiments validate that\nMSCoTDet effectively mitigates modality biases and improves multispectral\npedestrian detection.\n","authors":["Taeheon Kim","Sangyun Chung","Damin Yeom","Youngjoon Yu","Hak Gu Kim","Yong Man Ro"],"pdf_url":"https://arxiv.org/pdf/2403.15209v3.pdf","comment":"IEEE Transactions on Circuits and Systems for Video Technology\n (TCSVT)"},{"id":"http://arxiv.org/abs/2501.04374v1","updated":"2025-01-08T09:28:25Z","published":"2025-01-08T09:28:25Z","title":"Instructive3D: Editing Large Reconstruction Models with Text\n Instructions","summary":" Transformer based methods have enabled users to create, modify, and\ncomprehend text and image data. Recently proposed Large Reconstruction Models\n(LRMs) further extend this by providing the ability to generate high-quality 3D\nmodels with the help of a single object image. These models, however, lack the\nability to manipulate or edit the finer details, such as adding standard design\npatterns or changing the color and reflectance of the generated objects, thus\nlacking fine-grained control that may be very helpful in domains such as\naugmented reality, animation and gaming. Naively training LRMs for this purpose\nwould require generating precisely edited images and 3D object pairs, which is\ncomputationally expensive. In this paper, we propose Instructive3D, a novel LRM\nbased model that integrates generation and fine-grained editing, through user\ntext prompts, of 3D objects into a single model. We accomplish this by adding\nan adapter that performs a diffusion process conditioned on a text prompt\nspecifying edits in the triplane latent space representation of 3D object\nmodels. 
Our method does not require the generation of edited 3D objects.\nAdditionally, Instructive3D allows us to perform geometrically consistent\nmodifications, as the edits done through user-defined text prompts are applied\nto the triplane latent representation thus enhancing the versatility and\nprecision of 3D objects generated. We compare the objects generated by\nInstructive3D and a baseline that first generates the 3D object meshes using a\nstandard LRM model and then edits these 3D objects using text prompts when\nimages are provided from the Objaverse LVIS dataset. We find that Instructive3D\nproduces qualitatively superior 3D objects with the properties specified by the\nedit prompts.\n","authors":["Kunal Kathare","Ankit Dhiman","K Vikas Gowda","Siddharth Aravindan","Shubham Monga","Basavaraja Shanthappa Vandrotti","Lokesh R Boregowda"],"pdf_url":"https://arxiv.org/pdf/2501.04374v1.pdf","comment":"Accepted at WACV 2025. First two authors contributed equally"},{"id":"http://arxiv.org/abs/2501.04373v1","updated":"2025-01-08T09:26:36Z","published":"2025-01-08T09:26:36Z","title":"FGU3R: Fine-Grained Fusion via Unified 3D Representation for Multimodal\n 3D Object Detection","summary":" Multimodal 3D object detection has garnered considerable interest in\nautonomous driving. However, multimodal detectors suffer from dimension\nmismatches that derive from fusing 3D points with 2D pixels coarsely, which\nleads to sub-optimal fusion performance. In this paper, we propose a multimodal\nframework FGU3R to tackle the issue mentioned above via unified 3D\nrepresentation and fine-grained fusion, which consists of two important\ncomponents. First, we propose an efficient feature extractor for raw and pseudo\npoints, termed Pseudo-Raw Convolution (PRConv), which modulates multimodal\nfeatures synchronously and aggregates the features from different types of\npoints on key points based on multimodal interaction. 
Second, a Cross-Attention\nAdaptive Fusion (CAAF) is designed to fuse homogeneous 3D RoI (Region of\nInterest) features adaptively via a cross-attention variant in a fine-grained\nmanner. Together they make fine-grained fusion on unified 3D representation.\nThe experiments conducted on the KITTI and nuScenes show the effectiveness of\nour proposed method.\n","authors":["Guoxin Zhang","Ziying Song","Lin Liu","Zhonghong Ou"],"pdf_url":"https://arxiv.org/pdf/2501.04373v1.pdf","comment":"Accepted by ICASSP 2025"},{"id":"http://arxiv.org/abs/2412.04247v2","updated":"2025-01-08T09:18:03Z","published":"2024-12-05T15:27:58Z","title":"3D Part Segmentation via Geometric Aggregation of 2D Visual Features","summary":" Supervised 3D part segmentation models are tailored for a fixed set of\nobjects and parts, limiting their transferability to open-set, real-world\nscenarios. Recent works have explored vision-language models (VLMs) as a\npromising alternative, using multi-view rendering and textual prompting to\nidentify object parts. However, naively applying VLMs in this context\nintroduces several drawbacks, such as the need for meticulous prompt\nengineering, and fails to leverage the 3D geometric structure of objects. To\naddress these limitations, we propose COPS, a COmprehensive model for Parts\nSegmentation that blends the semantics extracted from visual concepts and 3D\ngeometry to effectively identify object parts. COPS renders a point cloud from\nmultiple viewpoints, extracts 2D features, projects them back to 3D, and uses a\nnovel geometric-aware feature aggregation procedure to ensure spatial and\nsemantic consistency. Finally, it clusters points into parts and labels them.\nWe demonstrate that COPS is efficient, scalable, and achieves zero-shot\nstate-of-the-art performance across five datasets, covering synthetic and\nreal-world data, texture-less and coloured objects, as well as rigid and\nnon-rigid shapes. 
The code is available at https://3d-cops.github.io.\n","authors":["Marco Garosi","Riccardo Tedoldi","Davide Boscaini","Massimiliano Mancini","Nicu Sebe","Fabio Poiesi"],"pdf_url":"https://arxiv.org/pdf/2412.04247v2.pdf","comment":"Published in WACV 2025. Project page: https://3d-cops.github.io/"},{"id":"http://arxiv.org/abs/2501.04361v1","updated":"2025-01-08T08:58:53Z","published":"2025-01-08T08:58:53Z","title":"A Unified Framework for Foreground and Anonymization Area Segmentation\n in CT and MRI Data","summary":" This study presents an open-source toolkit to address critical challenges in\npreprocessing data for self-supervised learning (SSL) for 3D medical imaging,\nfocusing on data privacy and computational efficiency. The toolkit comprises\ntwo main components: a segmentation network that delineates foreground regions\nto optimize data sampling and thus reduce training time, and a segmentation\nnetwork that identifies anonymized regions, preventing erroneous supervision in\nreconstruction-based SSL methods. Experimental results demonstrate high\nrobustness, with mean Dice scores exceeding 98.5 across all anonymization\nmethods and surpassing 99.5 for foreground segmentation tasks, highlighting the\nefficacy of the toolkit in supporting SSL applications in 3D medical imaging\nfor both CT and MRI images. 
The weights and code are available at\nhttps://github.com/MIC-DKFZ/Foreground-and-Anonymization-Area-Segmentation.\n","authors":["Michal Nohel","Constantin Ulrich","Jonathan Suprijadi","Tassilo Wald","Klaus Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2501.04361v1.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2501.04353v1","updated":"2025-01-08T08:51:35Z","published":"2025-01-08T08:51:35Z","title":"DeFusion: An Effective Decoupling Fusion Network for Multi-Modal\n Pregnancy Prediction","summary":" Temporal embryo images and parental fertility table indicators are both\nvaluable for pregnancy prediction in \textbf{in vitro fertilization embryo\ntransfer} (IVF-ET). However, current machine learning models cannot make full\nuse of the complementary information between the two modalities to improve\npregnancy prediction performance. In this paper, we propose a Decoupling Fusion\nNetwork called DeFusion to effectively integrate the multi-modal information\nfor IVF-ET pregnancy prediction. Specifically, we propose a decoupling fusion\nmodule that decouples the information from the different modalities into\nrelated and unrelated information, thereby achieving a more delicate fusion.\nAnd we fuse temporal embryo images with a spatial-temporal position encoding,\nand extract fertility table indicator information with a table transformer. To\nevaluate the effectiveness of our model, we use a new dataset including 4046\ncases collected from Southern Medical University. The experiments show that our\nmodel outperforms state-of-the-art methods. Meanwhile, the performance on the\neye disease prediction dataset reflects the model's good generalization. 
Our\ncode and dataset are available at https://github.com/Ou-Young-1999/DFNet.\n","authors":["Xueqiang Ouyang","Jia Wei","Wenjie Huo","Xiaocong Wang","Rui Li","Jianlong Zhou"],"pdf_url":"https://arxiv.org/pdf/2501.04353v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04352v1","updated":"2025-01-08T08:49:52Z","published":"2025-01-08T08:49:52Z","title":"Online Gaussian Test-Time Adaptation of Vision-Language Models","summary":" Online test-time adaptation (OTTA) of vision-language models (VLMs) has\nrecently garnered increased attention to take advantage of data observed along\na stream to improve future predictions. Unfortunately, existing methods rely on\ndataset-specific hyperparameters, significantly limiting their adaptability to\nunseen tasks. In response, we propose Online Gaussian Adaptation (OGA), a novel\nmethod that models the likelihoods of visual features using Gaussian\ndistributions and incorporates zero-shot priors into an interpretable Maximum A\nPosteriori (MAP) estimation framework with fixed hyper-parameters across all\ndatasets. We demonstrate that OGA outperforms state-of-the-art methods on most\ndatasets and runs. Additionally, we show that combining OTTA with popular\nfew-shot techniques (a practical yet overlooked setting in prior research) is\nhighly beneficial. Furthermore, our experimental study reveals that common OTTA\nevaluation protocols, which average performance over at most three runs per\ndataset, are inadequate due to the substantial variability observed across runs\nfor all OTTA methods. Therefore, we advocate for more rigorous evaluation\npractices, including increasing the number of runs and considering additional\nquantitative metrics, such as our proposed Expected Tail Accuracy (ETA),\ncalculated as the average accuracy in the worst 10% of runs. We hope these\ncontributions will encourage more rigorous and diverse evaluation practices in\nthe OTTA community. 
Code is available at https://github.com/cfuchs2023/OGA .\n","authors":["Clément Fuchs","Maxime Zanella","Christophe De Vleeschouwer"],"pdf_url":"https://arxiv.org/pdf/2501.04352v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.17042v2","updated":"2025-01-08T08:22:49Z","published":"2024-12-22T14:49:55Z","title":"Adapting Image-to-Video Diffusion Models for Large-Motion Frame\n Interpolation","summary":" As video generation models have advanced significantly in recent\nyears, we adopt large-scale image-to-video diffusion models for video\nframe interpolation. We present a conditional encoder designed to adapt an\nimage-to-video model for large-motion frame interpolation. To enhance\nperformance, we integrate a dual-branch feature extractor and propose a\ncross-frame attention mechanism that effectively captures both spatial and\ntemporal information, enabling accurate interpolations of intermediate frames.\nOur approach demonstrates superior performance on the Fr\'echet Video Distance\n(FVD) metric when evaluated against other state-of-the-art approaches,\nparticularly in handling large motion scenarios, highlighting advancements in\ngenerative-based methodologies.\n","authors":["Luoxu Jin","Hiroshi Watanabe"],"pdf_url":"https://arxiv.org/pdf/2412.17042v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04336v1","updated":"2025-01-08T08:15:29Z","published":"2025-01-08T08:15:29Z","title":"Building a Mind Palace: Structuring Environment-Grounded Semantic Graphs\n for Effective Long Video Analysis with LLMs","summary":" Long-form video understanding with Large Vision Language Models is challenged\nby the need to analyze temporally dispersed yet spatially concentrated key\nmoments within limited context windows. 
In this work, we introduce\nVideoMindPalace, a new framework inspired by the \"Mind Palace\", which organizes\ncritical video moments into a topologically structured semantic graph.\nVideoMindPalace organizes key information through (i) hand-object tracking and\ninteraction, (ii) clustered activity zones representing specific areas of\nrecurring activities, and (iii) environment layout mapping, allowing natural\nlanguage parsing by LLMs to provide grounded insights on spatio-temporal and 3D\ncontext. In addition, we propose the Video MindPalace Benchmark (VMB), to\nassess human-like reasoning, including spatial localization, temporal\nreasoning, and layout-aware sequential understanding. Evaluated on VMB and\nestablished video QA datasets, including EgoSchema, NExT-QA, IntentQA, and the\nActive Memories Benchmark, VideoMindPalace demonstrates notable gains in\nspatio-temporal coherence and human-aligned reasoning, advancing long-form\nvideo analysis capabilities in VLMs.\n","authors":["Zeyi Huang","Yuyang Ji","Xiaofang Wang","Nikhil Mehta","Tong Xiao","Donghyun Lee","Sigmund Vanvalkenburgh","Shengxin Zha","Bolin Lai","Licheng Yu","Ning Zhang","Yong Jae Lee","Miao Liu"],"pdf_url":"https://arxiv.org/pdf/2501.04336v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04329v1","updated":"2025-01-08T08:03:49Z","published":"2025-01-08T08:03:49Z","title":"An Efficient Adaptive Compression Method for Human Perception and\n Machine Vision Tasks","summary":" While most existing neural image compression (NIC) and neural video\ncompression (NVC) methodologies have achieved remarkable success, their\noptimization is primarily focused on human visual perception. However, with the\nrapid development of artificial intelligence, many images and videos will be\nused for various machine vision tasks. Consequently, such existing compression\nmethodologies cannot achieve competitive performance in machine vision. 
In this\nwork, we introduce an efficient adaptive compression (EAC) method tailored for\nboth human perception and multiple machine vision tasks. Our method involves\ntwo key modules: 1), an adaptive compression mechanism, that adaptively selects\nseveral subsets from latent features to balance the optimizations for multiple\nmachine vision tasks (e.g., segmentation, and detection) and human vision. 2),\na task-specific adapter, that uses the parameter-efficient delta-tuning\nstrategy to stimulate the comprehensive downstream analytical networks for\nspecific machine vision tasks. By using the above two modules, we can optimize\nthe bit-rate costs and improve machine vision performance. In general, our\nproposed EAC can seamlessly integrate with existing NIC (i.e., Ball\\'e2018, and\nCheng2020) and NVC (i.e., DVC, and FVC) methods. Extensive evaluation on\nvarious benchmark datasets (i.e., VOC2007, ILSVRC2012, VOC2012, COCO, UCF101,\nand DAVIS) shows that our method enhances performance for multiple machine\nvision tasks while maintaining the quality of human vision.\n","authors":["Lei Liu","Zhenghao Chen","Zhihao Hu","Dong Xu"],"pdf_url":"https://arxiv.org/pdf/2501.04329v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04325v1","updated":"2025-01-08T07:52:12Z","published":"2025-01-08T07:52:12Z","title":"Edit as You See: Image-guided Video Editing via Masked Motion Modeling","summary":" Recent advancements in diffusion models have significantly facilitated\ntext-guided video editing. However, there is a relative scarcity of research on\nimage-guided video editing, a method that empowers users to edit videos by\nmerely indicating a target object in the initial frame and providing an RGB\nimage as reference, without relying on the text prompts. In this paper, we\npropose a novel Image-guided Video Editing Diffusion model, termed IVEDiff for\nthe image-guided video editing. 
IVEDiff is built on top of image editing\nmodels, and is equipped with learnable motion modules to maintain the temporal\nconsistency of edited video. Inspired by self-supervised learning concepts, we\nintroduce a masked motion modeling fine-tuning strategy that empowers the\nmotion module's capabilities for capturing inter-frame motion dynamics, while\npreserving the capabilities for intra-frame semantic correlations modeling of\nthe base image editing model. Moreover, an optical-flow-guided motion reference\nnetwork is proposed to ensure the accurate propagation of information between\nedited video frames, alleviating the misleading effects of invalid information.\nWe also construct a benchmark to facilitate further research. The comprehensive\nexperiments demonstrate that our method is able to generate temporally smooth\nedited videos while robustly dealing with various editing objects with high\nquality.\n","authors":["Zhi-Lin Huang","Yixuan Liu","Chujun Qin","Zhongdao Wang","Dong Zhou","Dong Li","Emad Barsoum"],"pdf_url":"https://arxiv.org/pdf/2501.04325v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.18930v2","updated":"2025-01-08T07:43:09Z","published":"2024-12-25T15:20:54Z","title":"Graph Cut-guided Maximal Coding Rate Reduction for Learning Image\n Embedding and Clustering","summary":" In the era of pre-trained models, image clustering task is usually addressed\nby two relevant stages: a) to produce features from pre-trained vision models;\nand b) to find clusters from the pre-trained features. However, these two\nstages are often considered separately or learned by different paradigms,\nleading to suboptimal clustering performance. 
In this paper, we propose a\nunified framework, termed graph Cut-guided Maximal Coding Rate Reduction\n(CgMCR$^2$), for jointly learning the structured embeddings and the clustering.\nTo be specific, we attempt to integrate an efficient clustering module into the\nprincipled framework for learning structured representation, in which the\nclustering module is used to provide partition information to guide the\ncluster-wise compression and the learned embeddings is aligned to desired\ngeometric structures in turn to help for yielding more accurate partitions. We\nconduct extensive experiments on both standard and out-of-domain image datasets\nand experimental results validate the effectiveness of our approach.\n","authors":["W. He","Z. Huang","X. Meng","X. Qi","R. Xiao","C. -G. Li"],"pdf_url":"https://arxiv.org/pdf/2412.18930v2.pdf","comment":"24 pages, 9 figures, accepted in ACCV2024"},{"id":"http://arxiv.org/abs/2501.04322v1","updated":"2025-01-08T07:42:54Z","published":"2025-01-08T07:42:54Z","title":"Eve: Efficient Multimodal Vision Language Models with Elastic Visual\n Experts","summary":" Multimodal vision language models (VLMs) have made significant progress with\nthe support of continuously increasing model sizes and data volumes. Running\nVLMs on edge devices has become a challenge for their widespread application.\nThere are several efficient VLM efforts, but they often sacrifice linguistic\ncapabilities to enhance multimodal abilities, or require extensive training. To\naddress this quandary,we introduce the innovative framework of Efficient Vision\nLanguage Models with Elastic Visual Experts (Eve). By strategically\nincorporating adaptable visual expertise at multiple stages of training, Eve\nstrikes a balance between preserving linguistic abilities and augmenting\nmultimodal capabilities. This balanced approach results in a versatile model\nwith only 1.8B parameters that delivers significant improvements in both\nmultimodal and linguistic tasks. 
Notably, in configurations below 3B\nparameters, Eve distinctly outperforms in language benchmarks and achieves\nstate-of-the-art results 68.87% in VLM Benchmarks. Additionally, its multimodal\naccuracy outstrips that of the larger 7B LLaVA-1.5 model.\n","authors":["Miao Rang","Zhenni Bi","Chuanjian Liu","Yehui Tang","Kai Han","Yunhe Wang"],"pdf_url":"https://arxiv.org/pdf/2501.04322v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.11102v2","updated":"2025-01-08T07:22:51Z","published":"2024-12-15T07:49:31Z","title":"Empowering LLMs to Understand and Generate Complex Vector Graphics","summary":" The unprecedented advancements in Large Language Models (LLMs) have\nprofoundly impacted natural language processing but have yet to fully embrace\nthe realm of scalable vector graphics (SVG) generation. While LLMs encode\npartial knowledge of SVG data from web pages during training, recent findings\nsuggest that semantically ambiguous and tokenized representations within LLMs\nmay result in hallucinations in vector primitive predictions. Additionally, LLM\ntraining typically lacks modeling and understanding of the rendering sequence\nof vector paths, which can lead to occlusion between output vector primitives.\nIn this paper, we present LLM4SVG, an initial yet substantial step toward\nbridging this gap by enabling LLMs to better understand and generate vector\ngraphics. LLM4SVG facilitates a deeper understanding of SVG components through\nlearnable semantic tokens, which precisely encode these tokens and their\ncorresponding properties to generate semantically aligned SVG outputs. Using a\nseries of learnable semantic tokens, a structured dataset for instruction\nfollowing is developed to support comprehension and generation across two\nprimary tasks. 
Our method introduces a modular architecture to existing large\nlanguage models, integrating semantic tags, vector instruction encoders,\nfine-tuned commands, and powerful LLMs to tightly combine geometric,\nappearance, and language information. To overcome the scarcity of SVG-text\ninstruction data, we developed an automated data generation pipeline that\ncollected a massive dataset of more than 250k SVG data and 580k SVG-text\ninstructions, which facilitated the adoption of the two-stage training strategy\npopular in LLM development. By exploring various training strategies, we\ndeveloped LLM4SVG, which significantly moves beyond optimized rendering-based\napproaches and language-model-based baselines to achieve remarkable results in\nhuman evaluation tasks.\n","authors":["Ximing Xing","Juncheng Hu","Guotao Liang","Jing Zhang","Dong Xu","Qian Yu"],"pdf_url":"https://arxiv.org/pdf/2412.11102v2.pdf","comment":"Project Page: https://ximinng.github.io/LLM4SVGProject/"},{"id":"http://arxiv.org/abs/2501.03775v2","updated":"2025-01-08T07:05:16Z","published":"2025-01-07T13:30:54Z","title":"Strip R-CNN: Large Strip Convolution for Remote Sensing Object Detection","summary":" While witnessed with rapid development, remote sensing object detection\nremains challenging for detecting high aspect ratio objects. This paper shows\nthat large strip convolutions are good feature representation learners for\nremote sensing object detection and can detect objects of various aspect ratios\nwell. Based on large strip convolutions, we build a new network architecture\ncalled Strip R-CNN, which is simple, efficient, and powerful. Unlike recent\nremote sensing object detectors that leverage large-kernel convolutions with\nsquare shapes, our Strip R-CNN takes advantage of sequential orthogonal large\nstrip convolutions to capture spatial information. 
In addition, we enhance the\nlocalization capability of remote-sensing object detectors by decoupling the\ndetection heads and equipping the localization head with strip convolutions to\nbetter localize the target objects. Extensive experiments on several\nbenchmarks, e.g., DOTA, FAIR1M, HRSC2016, and DIOR, show that our Strip R-CNN\ncan largely improve previous works. Notably, our 30M model achieves 82.75% mAP\non DOTA-v1.0, setting a new state-of-the-art record. Code is available at\nhttps://github.com/YXB-NKU/Strip-R-CNN.\n","authors":["Xinbin Yuan","ZhaoHui Zheng","Yuxuan Li","Xialei Liu","Li Liu","Xiang Li","Qibin Hou","Ming-Ming Cheng"],"pdf_url":"https://arxiv.org/pdf/2501.03775v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01714v4","updated":"2025-01-08T06:52:07Z","published":"2024-04-02T07:57:17Z","title":"Conjugate-Gradient-like Based Adaptive Moment Estimation Optimization\n Algorithm for Deep Learning","summary":" Training deep neural networks is a challenging task. In order to speed up\ntraining and enhance the performance of deep neural networks, we rectify the\nvanilla conjugate gradient as conjugate-gradient-like and incorporate it into\nthe generic Adam, and thus propose a new optimization algorithm named\nCG-like-Adam for deep learning. Specifically, both the first-order and the\nsecond-order moment estimation of generic Adam are replaced by the\nconjugate-gradient-like. Convergence analysis handles the cases where the\nexponential moving average coefficient of the first-order moment estimation is\nconstant and the first-order moment estimation is unbiased. 
Numerical\nexperiments show the superiority of the proposed algorithm based on the\nCIFAR10/100 dataset.\n","authors":["Jiawu Tian","Liwei Xu","Xiaowei Zhang","Yongqi Li"],"pdf_url":"https://arxiv.org/pdf/2404.01714v4.pdf","comment":"32 pages, 13 figures"},{"id":"http://arxiv.org/abs/2412.19112v2","updated":"2025-01-08T06:45:02Z","published":"2024-12-26T08:11:41Z","title":"Future Success Prediction in Open-Vocabulary Object Manipulation Tasks\n Based on End-Effector Trajectories","summary":" This study addresses a task designed to predict the future success or failure\nof open-vocabulary object manipulation. In this task, the model is required to\nmake predictions based on natural language instructions, egocentric view images\nbefore manipulation, and the given end-effector trajectories. Conventional\nmethods typically perform success prediction only after the manipulation is\nexecuted, limiting their efficiency in executing the entire task sequence. We\npropose a novel approach that enables the prediction of success or failure by\naligning the given trajectories and images with natural language instructions.\nWe introduce Trajectory Encoder to apply learnable weighting to the input\ntrajectories, allowing the model to consider temporal dynamics and interactions\nbetween objects and the end effector, improving the model's ability to predict\nmanipulation outcomes accurately. We constructed a dataset based on the RT-1\ndataset, a large-scale benchmark for open-vocabulary object manipulation tasks,\nto evaluate our method. 
The experimental results show that our method achieved\na higher prediction accuracy than baseline approaches.\n","authors":["Motonari Kambara","Komei Sugiura"],"pdf_url":"https://arxiv.org/pdf/2412.19112v2.pdf","comment":"Accepted for presentation at LangRob @ CoRL 2024"},{"id":"http://arxiv.org/abs/2309.05271v2","updated":"2025-01-08T06:30:39Z","published":"2023-09-11T07:05:02Z","title":"AutoFuse: Automatic Fusion Networks for Deformable Medical Image\n Registration","summary":" Deformable image registration aims to find a dense non-linear spatial\ncorrespondence between a pair of images, which is a crucial step for many\nmedical tasks such as tumor growth monitoring and population analysis.\nRecently, Deep Neural Networks (DNNs) have been widely recognized for their\nability to perform fast end-to-end registration. However, DNN-based\nregistration needs to explore the spatial information of each image and fuse\nthis information to characterize spatial correspondence. This raises an\nessential question: what is the optimal fusion strategy to characterize spatial\ncorrespondence? Existing fusion strategies (e.g., early fusion, late fusion)\nwere empirically designed to fuse information by manually defined prior\nknowledge, which inevitably constrains the registration performance within the\nlimits of empirical designs. In this study, we depart from existing\nempirically-designed fusion strategies and develop a data-driven fusion\nstrategy for deformable image registration. To achieve this, we propose an\nAutomatic Fusion network (AutoFuse) that provides flexibility to fuse\ninformation at many potential locations within the network. A Fusion Gate (FG)\nmodule is also proposed to control how to fuse information at each potential\nnetwork location based on training data. 
Our AutoFuse can automatically\noptimize its fusion strategy during training and can be generalizable to both\nunsupervised registration (without any labels) and semi-supervised registration\n(with weak labels provided for partial training data). Extensive experiments on\ntwo well-benchmarked medical registration tasks (inter- and intra-patient\nregistration) with eight public datasets show that our AutoFuse outperforms\nstate-of-the-art unsupervised and semi-supervised registration methods.\n","authors":["Mingyuan Meng","Michael Fulham","Dagan Feng","Lei Bi","Jinman Kim"],"pdf_url":"https://arxiv.org/pdf/2309.05271v2.pdf","comment":"Published at Pattern Recognition"},{"id":"http://arxiv.org/abs/2501.04304v1","updated":"2025-01-08T06:30:31Z","published":"2025-01-08T06:30:31Z","title":"DGQ: Distribution-Aware Group Quantization for Text-to-Image Diffusion\n Models","summary":" Despite the widespread use of text-to-image diffusion models across various\ntasks, their computational and memory demands limit practical applications. To\nmitigate this issue, quantization of diffusion models has been explored. It\nreduces memory usage and computational costs by compressing weights and\nactivations into lower-bit formats. However, existing methods often struggle to\npreserve both image quality and text-image alignment, particularly in\nlower-bit($<$ 8bits) quantization. In this paper, we analyze the challenges\nassociated with quantizing text-to-image diffusion models from a distributional\nperspective. Our analysis reveals that activation outliers play a crucial role\nin determining image quality. Additionally, we identify distinctive patterns in\ncross-attention scores, which significantly affects text-image alignment. To\naddress these challenges, we propose Distribution-aware Group Quantization\n(DGQ), a method that identifies and adaptively handles pixel-wise and\nchannel-wise outliers to preserve image quality. 
Furthermore, DGQ applies\nprompt-specific logarithmic quantization scales to maintain text-image\nalignment. Our method demonstrates remarkable performance on datasets such as\nMS-COCO and PartiPrompts. We are the first to successfully achieve low-bit\nquantization of text-to-image diffusion models without requiring additional\nfine-tuning of weight quantization parameters.\n","authors":["Hyogon Ryu","NaHyeon Park","Hyunjung Shim"],"pdf_url":"https://arxiv.org/pdf/2501.04304v1.pdf","comment":"Project page: https://ugonfor.kr/DGQ"},{"id":"http://arxiv.org/abs/2501.04302v1","updated":"2025-01-08T06:26:16Z","published":"2025-01-08T06:26:16Z","title":"H-MBA: Hierarchical MamBa Adaptation for Multi-Modal Video Understanding\n in Autonomous Driving","summary":" With the prevalence of Multimodal Large Language Models(MLLMs), autonomous\ndriving has encountered new opportunities and challenges. In particular,\nmulti-modal video understanding is critical to interactively analyze what will\nhappen in the procedure of autonomous driving. However, videos in such a\ndynamical scene that often contains complex spatial-temporal movements, which\nrestricts the generalization capacity of the existing MLLMs in this field. To\nbridge the gap, we propose a novel Hierarchical Mamba Adaptation (H-MBA)\nframework to fit the complicated motion changes in autonomous driving videos.\nSpecifically, our H-MBA consists of two distinct modules, including Context\nMamba (C-Mamba) and Query Mamba (Q-Mamba). First, C-Mamba contains various\ntypes of structure state space models, which can effectively capture\nmulti-granularity video context for different temporal resolutions. Second,\nQ-Mamba flexibly transforms the current frame as the learnable query, and\nattentively selects multi-granularity video context into query. Consequently,\nit can adaptively integrate all the video contexts of multi-scale temporal\nresolutions to enhance video understanding. 
Via a plug-and-play paradigm in\nMLLMs, our H-MBA shows the remarkable performance on multi-modal video tasks in\nautonomous driving, e.g., for risk object detection, it outperforms the\nprevious SOTA method with 5.5% mIoU improvement.\n","authors":["Siran Chen","Yuxiao Luo","Yue Ma","Yu Qiao","Yali Wang"],"pdf_url":"https://arxiv.org/pdf/2501.04302v1.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2312.02541v2","updated":"2025-01-08T06:08:30Z","published":"2023-12-05T07:12:05Z","title":"Explainable Severity ranking via pairwise n-hidden comparison: a case\n study of glaucoma","summary":" Primary open-angle glaucoma (POAG) is a chronic and progressive optic nerve\ncondition that results in an acquired loss of optic nerve fibers and potential\nblindness. The gradual onset of glaucoma results in patients progressively\nlosing their vision without being consciously aware of the changes. To diagnose\nPOAG and determine its severity, patients must undergo a comprehensive dilated\neye examination. In this work, we build a framework to rank, compare, and\ninterpret the severity of glaucoma using fundus images. We introduce a\nsiamese-based severity ranking using pairwise n-hidden comparisons. We\nadditionally have a novel approach to explaining why a specific image is deemed\nmore severe than others. Our findings indicate that the proposed severity\nranking model surpasses traditional ones in terms of diagnostic accuracy and\ndelivers improved saliency explanations.\n","authors":["Hong Nguyen","Cuong V. Nguyen","Shrikanth Narayanan","Benjamin Y. Xu","Michael Pazzani"],"pdf_url":"https://arxiv.org/pdf/2312.02541v2.pdf","comment":"4 pages"},{"id":"http://arxiv.org/abs/2501.04293v1","updated":"2025-01-08T05:35:07Z","published":"2025-01-08T05:35:07Z","title":"TADFormer : Task-Adaptive Dynamic Transformer for Efficient Multi-Task\n Learning","summary":" Transfer learning paradigm has driven substantial advancements in various\nvision tasks. 
However, as state-of-the-art models continue to grow, classical\nfull fine-tuning often becomes computationally impractical, particularly in\nmulti-task learning (MTL) setups, where training complexity increases in\nproportion to the number of tasks. Consequently, recent studies have explored\nParameter-Efficient Fine-Tuning (PEFT) for MTL architectures. Despite some\nprogress, these approaches still exhibit limitations in capturing fine-grained,\ntask-specific features that are crucial to MTL. In this paper, we introduce\nTask-Adaptive Dynamic transFormer, termed TADFormer, a novel PEFT framework\nthat performs task-aware feature adaptation in a fine-grained manner by\ndynamically considering task-specific input contexts. TADFormer introduces\nparameter-efficient prompting for task adaptation and a Dynamic Task Filter\n(DTF) to capture task information conditioned on input contexts. Experiments on\nthe PASCAL-Context benchmark demonstrate that the proposed method achieves\nhigher accuracy in dense scene understanding tasks, while reducing the number\nof trainable parameters by up to 8.4 times compared to full fine-tuning of\nMTL models. 
TADFormer also demonstrates superior parameter efficiency and\naccuracy compared to recent PEFT methods.\n","authors":["Seungmin Baek","Soyul Lee","Hayeon Jo","Hyesong Choi","Dongbo Min"],"pdf_url":"https://arxiv.org/pdf/2501.04293v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.17051v2","updated":"2025-01-08T05:26:58Z","published":"2023-12-28T14:52:07Z","title":"FILP-3D: Enhancing 3D Few-shot Class-incremental Learning with\n Pre-trained Vision-Language Models","summary":" Few-shot class-incremental learning (FSCIL) aims to mitigate the catastrophic\nforgetting issue when a model is incrementally trained on limited data.\nHowever, many of these works lack effective exploration of prior knowledge,\nrendering them unable to effectively address the domain gap issue in the\ncontext of 3D FSCIL, thereby leading to catastrophic forgetting. The\nContrastive Vision-Language Pre-Training (CLIP) model serves as a highly\nsuitable backbone for addressing the challenges of 3D FSCIL due to its abundant\nshape-related prior knowledge. Unfortunately, its direct application to 3D\nFSCIL still faces the incompatibility between 3D data representation and the 2D\nfeatures, primarily manifested as feature space misalignment and significant\nnoise. To address the above challenges, we introduce the FILP-3D framework with\ntwo novel components: the Redundant Feature Eliminator (RFE) for feature space\nmisalignment and the Spatial Noise Compensator (SNC) for significant noise. RFE\naligns the feature spaces of input point clouds and their embeddings by\nperforming a unique dimensionality reduction on the feature space of\npre-trained models (PTMs), effectively eliminating redundant information\nwithout compromising semantic integrity. On the other hand, SNC is a\ngraph-based 3D model designed to capture robust geometric information within\npoint clouds, thereby augmenting the knowledge lost due to projection,\nparticularly when processing real-world scanned data. 
Moreover, traditional\naccuracy metrics are shown to be biased due to the imbalance in existing 3D\ndatasets. Therefore, we propose the 3D FSCIL benchmark FSCIL3D-XL and novel\nevaluation metrics that offer a more nuanced assessment of a 3D FSCIL model.\nExperimental results on both established and our proposed benchmarks\ndemonstrate that our approach significantly outperforms existing\nstate-of-the-art methods.\n","authors":["Wan Xu","Tianyu Huang","Tianyu Qu","Guanglei Yang","Yiwen Guo","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2312.17051v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04284v1","updated":"2025-01-08T05:15:43Z","published":"2025-01-08T05:15:43Z","title":"ContextMRI: Enhancing Compressed Sensing MRI through Metadata\n Conditioning","summary":" Compressed sensing MRI seeks to accelerate MRI acquisition processes by\nsampling fewer k-space measurements and then reconstructing the missing data\nalgorithmically. The success of these approaches often relies on strong priors\nor learned statistical models. While recent diffusion model-based priors have\nshown great potential, previous methods typically ignore clinically available\nmetadata (e.g., patient demographics, imaging parameters, slice-specific\ninformation). In practice, metadata contains meaningful cues about the anatomy\nand acquisition protocol, suggesting it could further constrain the\nreconstruction problem. In this work, we propose ContextMRI, a text-conditioned\ndiffusion model for MRI that integrates granular metadata into the\nreconstruction process. We train a pixel-space diffusion model directly on\nminimally processed, complex-valued MRI images. During inference, metadata is\nconverted into a structured text prompt and fed to the model via CLIP text\nembeddings. By conditioning the prior on metadata, we unlock more accurate\nreconstructions and show consistent gains across multiple datasets,\nacceleration factors, and undersampling patterns. 
Our experiments demonstrate\nthat increasing the fidelity of metadata, ranging from slice location and\ncontrast to patient age, sex, and pathology, systematically boosts\nreconstruction performance. This work highlights the untapped potential of\nleveraging clinical context for inverse problems and opens a new direction for\nmetadata-driven MRI reconstruction.\n","authors":["Hyungjin Chung","Dohun Lee","Zihui Wu","Byung-Hoon Kim","Katherine L. Bouman","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2501.04284v1.pdf","comment":"29 pages, 9 figures"},{"id":"http://arxiv.org/abs/2501.04283v1","updated":"2025-01-08T05:14:36Z","published":"2025-01-08T05:14:36Z","title":"Enhancing Scene Classification in Cloudy Image Scenarios: A\n Collaborative Transfer Method with Information Regulation Mechanism using\n Optical Cloud-Covered and SAR Remote Sensing Images","summary":" In remote sensing scene classification, leveraging transfer methods with\nwell-trained optical models is an efficient way to overcome label scarcity.\nHowever, cloud contamination leads to optical information loss and significant\nimpacts on feature distribution, challenging the reliability and stability of\ntransferred target models. Common solutions include cloud removal for optical\ndata or directly using synthetic aperture radar (SAR) data in the target\ndomain. However, cloud removal requires substantial auxiliary data for support\nand pre-training, while directly using SAR disregards the unobstructed portions\nof optical data. This study presents a scene classification transfer method\nthat synergistically combines multi-modality data, which aims to transfer the\nsource domain model trained on cloud-free optical data to the target domain that\nincludes both cloudy optical and SAR data at low cost. 
Specifically, the\nframework incorporates two parts: (1) the collaborative transfer strategy,\nbased on knowledge distillation, enables efficient prior knowledge transfer\nacross heterogeneous data; (2) the information regulation mechanism (IRM) is\nproposed to address the modality imbalance issue during transfer. It employs\nauxiliary models to measure the contribution discrepancy of each modality, and\nautomatically balances the information utilization of modalities during the\ntarget model learning process at the sample level. The transfer experiments\nwere conducted on simulated and real cloud datasets, demonstrating the superior\nperformance of the proposed method compared to other solutions in cloud-covered\nscenarios. We also verified the importance and limitations of IRM, and further\ndiscussed and visualized the modality imbalance problem during the model\ntransfer. Codes are available at https://github.com/wangyuze-csu/ESCCS\n","authors":["Yuze Wang","Rong Xiao","Haifeng Li","Mariana Belgiu","Chao Tao"],"pdf_url":"https://arxiv.org/pdf/2501.04283v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16998v2","updated":"2025-01-08T04:58:02Z","published":"2023-12-28T13:02:16Z","title":"Deep Unfolding Network with Spatial Alignment for multi-modal MRI\n reconstruction","summary":" Multi-modal Magnetic Resonance Imaging (MRI) offers complementary diagnostic\ninformation, but some modalities are limited by the long scanning time. To\naccelerate the whole acquisition process, MRI reconstruction of one modality\nfrom highly undersampled k-space data with another fully-sampled reference\nmodality is an efficient solution. However, the misalignment between\nmodalities, which is common in clinical practice, can negatively affect\nreconstruction quality. 
Existing deep learning-based methods that account for\ninter-modality misalignment perform better, but still share two common\nlimitations: (1) the spatial alignment task is not adaptively integrated with\nthe reconstruction process, resulting in insufficient complementarity between\nthe two tasks; (2) the entire framework has weak interpretability. In this\npaper, we construct a novel Deep Unfolding Network with Spatial Alignment,\ntermed DUN-SA, to appropriately embed the spatial alignment task into the\nreconstruction process. Concretely, we derive a novel joint\nalignment-reconstruction model with a specially designed cross-modal spatial\nalignment term. By relaxing the model into cross-modal spatial alignment and\nmulti-modal reconstruction tasks, we propose an effective algorithm to solve\nthis model alternately. Then, we unfold the iterative steps of the proposed\nalgorithm and design corresponding network modules to build DUN-SA with\ninterpretability. Through end-to-end training, we effectively compensate for\nspatial misalignment using only the reconstruction loss, and utilize the\nprogressively aligned reference modality to provide an inter-modality prior\nthat improves the reconstruction of the target modality. 
Comprehensive experiments on\nthree real datasets demonstrate that our method exhibits superior\nreconstruction performance compared to state-of-the-art methods.\n","authors":["Hao Zhang","Qi Wang","Jun Shi","Shihui Ying","Zhijie Wen"],"pdf_url":"https://arxiv.org/pdf/2312.16998v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04269v1","updated":"2025-01-08T04:37:36Z","published":"2025-01-08T04:37:36Z","title":"Open set label noise learning with robust sample selection and\n margin-guided module","summary":" In recent years, the remarkable success of deep neural networks (DNNs) in\ncomputer vision is largely due to large-scale, high-quality labeled datasets.\nTraining directly on real-world datasets with label noise may result in\noverfitting. Traditional methods are limited to dealing with closed set label\nnoise, where noisy training data has true class labels within the known label\nspace. However, some real-world datasets contain open set label\nnoise, which means that some samples belong to an unknown class outside the\nknown label space. To address the open set label noise problem, we introduce a\nmethod based on Robust Sample Selection and Margin-Guided Module (RSS-MGM).\nFirstly, unlike prior clean sample selection approaches, which only select a\nlimited number of clean samples, a robust sample selection module combines\nsmall-loss selection with high-confidence sample selection to obtain more clean\nsamples. Secondly, to efficiently distinguish between open set and closed set\nlabel noise, margin functions are designed to filter open set data and closed\nset data. Thirdly, different processing methods are selected for different types of\nsamples in order to fully utilize the data's prior information and optimize the\nwhole model. 
Furthermore, extensive experimental results with noisy labeled\ndata from benchmark datasets and real-world datasets, such as CIFAR-100N-C,\nCIFAR80N-O, WebFG-469, and Food101N, indicate that our approach outperforms\nmany state-of-the-art label noise learning methods. In particular, it can more\naccurately separate open set label noise samples from closed set ones.\n","authors":["Yuandi Zhao","Qianxi Xia","Yang Sun","Zhijie Wen","Liyan Ma","Shihui Ying"],"pdf_url":"https://arxiv.org/pdf/2501.04269v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09420v3","updated":"2025-01-08T04:31:16Z","published":"2024-11-14T13:15:27Z","title":"SAG-ViT: A Scale-Aware, High-Fidelity Patching Approach with Graph\n Attention for Vision Transformers","summary":" Vision Transformers (ViTs) have redefined image classification by leveraging\nself-attention to capture complex patterns and long-range dependencies between\nimage patches. However, a key challenge for ViTs is efficiently incorporating\nmulti-scale feature representations, which is inherent in convolutional neural\nnetworks (CNNs) through their hierarchical structure. Graph transformers have\nmade strides in addressing this by leveraging graph-based modeling, but they\noften lose or insufficiently represent spatial hierarchies, especially since\nredundant or less relevant areas dilute the image's contextual representation.\nTo bridge this gap, we propose SAG-ViT, a Scale-Aware Graph Attention ViT that\nintegrates the multi-scale feature capabilities of CNNs, the representational\npower of ViTs, and graph-attended patching to enable richer contextual\nrepresentation. Using EfficientNetV2 as a backbone, the model extracts\nmulti-scale feature maps,\ndividing them into patches to preserve richer semantic information compared to\ndirectly patching the input images. The patches are structured into a graph\nusing spatial and feature similarities, where a Graph Attention Network (GAT)\nrefines the node embeddings. 
This refined graph representation is then\nprocessed by a Transformer encoder, capturing long-range dependencies and\ncomplex interactions. We evaluate SAG-ViT on benchmark datasets across various\ndomains, validating its effectiveness in advancing image classification tasks.\nOur code and weights are available at https://github.com/shravan-18/SAG-ViT.\n","authors":["Shravan Venkatraman","Jaskaran Singh Walia","Joe Dhanith P R"],"pdf_url":"https://arxiv.org/pdf/2411.09420v3.pdf","comment":"14 pages, 8 figures, 9 tables"},{"id":"http://arxiv.org/abs/2501.04268v1","updated":"2025-01-08T04:30:45Z","published":"2025-01-08T04:30:45Z","title":"Robotic Programmer: Video Instructed Policy Code Generation for Robotic\n Manipulation","summary":" Zero-shot generalization across various robots, tasks and environments\nremains a significant challenge in robotic manipulation. Policy code generation\nmethods use executable code to connect high-level task descriptions and\nlow-level action sequences, leveraging the generalization capabilities of large\nlanguage models and atomic skill libraries. In this work, we propose Robotic\nProgrammer (RoboPro), a robotic foundation model capable of perceiving visual\ninformation and following free-form instructions to perform\nrobotic manipulation with policy code in a zero-shot manner. To address low\nefficiency and high cost in collecting runtime code data for robotic tasks, we\ndevise Video2Code to synthesize executable code from extensive in-the-wild\nvideos with an off-the-shelf vision-language model and a code-domain large\nlanguage model. Extensive experiments show that RoboPro achieves\nstate-of-the-art zero-shot performance on robotic manipulation in both\nsimulators and real-world environments. 
Specifically, the zero-shot success\nrate of RoboPro on RLBench surpasses the state-of-the-art model GPT-4o by\n11.6%, which is even comparable to a strong supervised training baseline.\nFurthermore, RoboPro is robust to variations in API formats and skill sets.\n","authors":["Senwei Xie","Hongyu Wang","Zhanqi Xiao","Ruiping Wang","Xilin Chen"],"pdf_url":"https://arxiv.org/pdf/2501.04268v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.00547v2","updated":"2025-01-08T04:14:07Z","published":"2024-11-30T17:40:49Z","title":"Motion Dreamer: Realizing Physically Coherent Video Generation through\n Scene-Aware Motion Reasoning","summary":" Numerous recent video generation models, also known as world models, have\ndemonstrated the ability to generate plausible real-world videos. However, many\nstudies have shown that these models often produce motion results lacking\nlogical or physical coherence. In this paper, we revisit video generation\nmodels and find that single-stage approaches struggle to produce high-quality\nresults while maintaining coherent motion reasoning. To address this issue, we\npropose \textbf{Motion Dreamer}, a two-stage video generation framework. In\nStage I, the model generates an intermediate motion representation, such as a\nsegmentation map or depth map, based on the input image and motion conditions,\nfocusing solely on the motion itself. In Stage II, the model uses this\nintermediate motion representation as a condition to generate a high-detail\nvideo. By decoupling motion reasoning from high-fidelity video synthesis, our\napproach allows for more accurate and physically plausible motion generation.\nWe validate the effectiveness of our approach on the Physion dataset and in\nautonomous driving scenarios. For example, given a single push, our model can\nsynthesize the sequential toppling of a set of dominoes. Similarly, by varying\nthe movements of ego-cars, our model can produce different effects on other\nvehicles. 
Our work opens new avenues in creating models that can reason about\nphysical interactions in a more coherent and realistic manner. Our webpage is\navailable at: https://envision-research.github.io/MotionDreamer/.\n","authors":["Tianshuo Xu","Zhifei Chen","Leyi Wu","Hao Lu","Yuying Chen","Lihui Jiang","Bingbing Liu","Yingcong Chen"],"pdf_url":"https://arxiv.org/pdf/2412.00547v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.19106v2","updated":"2025-01-08T04:11:48Z","published":"2024-11-28T12:42:14Z","title":"Detailed Object Description with Controllable Dimensions","summary":" Object description plays an important role in helping visually impaired\nindividuals understand and compare the differences between objects. Recent multimodal\nlarge language models (MLLMs) exhibit powerful perceptual abilities and\ndemonstrate impressive potential for generating object-centric descriptions.\nHowever, the descriptions generated by such models may still contain much\ncontent that is not relevant to the user's intent or miss some important\nobject dimension details. In some scenarios, users may only need the\ndetails of certain dimensions of an object. In this paper, we propose a\ntraining-free object description refinement pipeline, Dimension Tailor,\ndesigned to enhance user-specified details in object descriptions. This\npipeline includes three steps: dimension extracting, erasing, and\nsupplementing, which decompose the description into user-specified dimensions.\nDimension Tailor can not only improve the quality of object details but also\noffer flexibility in including or excluding specific dimensions based on user\npreferences. We conducted extensive experiments to demonstrate the\neffectiveness of Dimension Tailor on controllable object descriptions. Notably,\nthe proposed pipeline can consistently improve the performance of recent\nMLLMs. 
The code is currently accessible at\nhttps://github.com/xin-ran-w/ControllableObjectDescription.\n","authors":["Xinran Wang","Haiwen Zhang","Baoteng Li","Kongming Liang","Hao Sun","Zhongjiang He","Zhanyu Ma","Jun Guo"],"pdf_url":"https://arxiv.org/pdf/2411.19106v2.pdf","comment":"11 pages, 8 figures"},{"id":"http://arxiv.org/abs/2406.11280v2","updated":"2025-01-08T03:18:23Z","published":"2024-06-17T07:33:30Z","title":"ISR-DPO: Aligning Large Multimodal Models for Videos by Iterative\n Self-Retrospective DPO","summary":" Iterative self-improvement, a concept extending beyond personal growth, has\nfound powerful applications in machine learning, particularly in transforming\nweak models into strong ones. While recent advances in natural language\nprocessing have shown its efficacy through iterative preference optimization,\napplying this approach to Video Large Multi-modal Models (VLMMs) remains\nchallenging due to modality misalignment. VLMMs struggle with this misalignment\nduring iterative preference modeling, as the self-judge model often prioritizes\nlinguistic knowledge over visual information. Additionally, iterative\npreference optimization can lead to visually hallucinated verbose responses due\nto length bias within the self-rewarding cycle. To address these issues, we\npropose Iterative Self-Retrospective Direct Preference Optimization (ISR-DPO),\na method that uses self-retrospection to enhance preference modeling. This\napproach enhances the self-judge's focus on informative video regions,\nresulting in more visually grounded preferences. In extensive empirical\nevaluations across diverse video question answering benchmarks, the ISR-DPO\nsignificantly outperforms the state of the art. 
We are committed to\nopen-sourcing our code, models, and datasets to encourage further\ninvestigation.\n","authors":["Daechul Ahn","Yura Choi","San Kim","Youngjae Yu","Dongyeop Kang","Jonghyun Choi"],"pdf_url":"https://arxiv.org/pdf/2406.11280v2.pdf","comment":"AAAI 2025"},{"id":"http://arxiv.org/abs/2311.07594v3","updated":"2025-01-08T02:33:37Z","published":"2023-11-10T09:51:24Z","title":"How to Bridge the Gap between Modalities: Survey on Multimodal Large\n Language Model","summary":" We explore Multimodal Large Language Models (MLLMs), which integrate LLMs\nlike GPT-4 to handle multimodal data, including text, images, audio, and more.\nMLLMs demonstrate capabilities such as generating image captions and answering\nimage-based questions, bridging the gap towards real-world human-computer\ninteractions and hinting at a potential pathway to artificial general\nintelligence. However, MLLMs still face challenges in addressing the semantic\ngap in multimodal data, which may lead to erroneous outputs, posing potential\nrisks to society. Selecting the appropriate modality alignment method is\ncrucial, as improper methods might require more parameters without significant\nperformance improvements. This paper aims to explore modality alignment methods\nfor LLMs and their current capabilities. 
Implementing effective modality\nalignment can help LLMs address environmental issues and enhance accessibility.\nThe study surveys existing modality alignment methods for MLLMs, categorizing\nthem into four groups: (1) Multimodal Converter, which transforms data into a\nformat that LLMs can understand; (2) Multimodal Perceiver, which improves how\nLLMs perceive different types of data; (3) Tool Learning, which leverages\nexternal tools to convert data into a common format, usually text; and (4)\nData-Driven Method, which teaches LLMs to understand specific data types within\ndatasets.\n","authors":["Shezheng Song","Xiaopeng Li","Shasha Li","Shan Zhao","Jie Yu","Jun Ma","Xiaoguang Mao","Weimin Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.07594v3.pdf","comment":"Accepted by TKDE"},{"id":"http://arxiv.org/abs/2412.16050v3","updated":"2025-01-08T01:47:16Z","published":"2024-12-20T16:52:11Z","title":"Label-Efficient Data Augmentation with Video Diffusion Models for\n Guidewire Segmentation in Cardiac Fluoroscopy","summary":" The accurate segmentation of guidewires in interventional cardiac fluoroscopy\nvideos is crucial for computer-aided navigation tasks. Although deep learning\nmethods have demonstrated high accuracy and robustness in wire segmentation,\nthey require substantial annotated datasets for generalizability, underscoring\nthe need for extensive labeled data to enhance model performance. To address\nthis challenge, we propose the Segmentation-guided Frame-consistency Video\nDiffusion Model (SF-VD) to generate large collections of labeled fluoroscopy\nvideos, augmenting the training data for wire segmentation networks. SF-VD\nleverages videos with limited annotations by independently modeling scene\ndistribution and motion distribution. 
It first samples the scene distribution\nby generating 2D fluoroscopy images with wires positioned according to a\nspecified input mask, and then samples the motion distribution by progressively\ngenerating subsequent frames, ensuring frame-to-frame coherence through a\nframe-consistency strategy. A segmentation-guided mechanism further refines the\nprocess by adjusting wire contrast, ensuring a diverse range of visibility in\nthe synthesized image. Evaluation on a fluoroscopy dataset confirms the\nsuperior quality of the generated videos and shows significant improvements in\nguidewire segmentation.\n","authors":["Shaoyan Pan","Yikang Liu","Lin Zhao","Eric Z. Chen","Xiao Chen","Terrence Chen","Shanhui Sun"],"pdf_url":"https://arxiv.org/pdf/2412.16050v3.pdf","comment":"AAAI 2025"},{"id":"http://arxiv.org/abs/2501.04217v1","updated":"2025-01-08T01:27:35Z","published":"2025-01-08T01:27:35Z","title":"Continual Self-supervised Learning Considering Medical Domain Knowledge\n in Chest CT Images","summary":" We propose a novel continual self-supervised learning method (CSSL)\nconsidering medical domain knowledge in chest CT images. Our approach addresses\nthe challenge of sequential learning by effectively capturing the relationship\nbetween previously learned knowledge and new information at different stages.\nBy incorporating an enhanced DER into CSSL and maintaining both diversity and\nrepresentativeness within the rehearsal buffer of DER, the risk of data\ninterference during pretraining is reduced, enabling the model to learn richer\nand more robust feature representations. In addition, we incorporate a mixup\nstrategy and feature distillation to further enhance the model's ability to\nlearn meaningful representations. 
We validate our method using chest CT images\nobtained under two different imaging conditions, demonstrating superior\nperformance compared to state-of-the-art methods.\n","authors":["Ren Tasai","Guang Li","Ren Togo","Minghui Tang","Takaaki Yoshimura","Hiroyuki Sugimori","Kenji Hirata","Takahiro Ogawa","Kohsuke Kudo","Miki Haseyama"],"pdf_url":"https://arxiv.org/pdf/2501.04217v1.pdf","comment":"Accepted by ICASSP 2025"},{"id":"http://arxiv.org/abs/2501.04213v1","updated":"2025-01-08T01:18:14Z","published":"2025-01-08T01:18:14Z","title":"UPAQ: A Framework for Real-Time and Energy-Efficient 3D Object Detection\n in Autonomous Vehicles","summary":" To enhance perception in autonomous vehicles (AVs), recent efforts are\nconcentrating on 3D object detectors, which deliver more comprehensive\npredictions than traditional 2D object detectors, at the cost of increased\nmemory footprint and computational resource usage. We present a novel framework\ncalled UPAQ, which leverages semi-structured pattern pruning and quantization\nto improve the efficiency of LiDAR point-cloud and camera-based 3D object\ndetectors on resource-constrained embedded AV platforms. 
Experimental results\non the Jetson Orin Nano embedded platform indicate that UPAQ achieves up to\n5.62x and 5.13x model compression rates, up to 1.97x and 1.86x boost in\ninference speed, and up to 2.07x and 1.87x reduction in energy consumption\ncompared to state-of-the-art model compression frameworks, on the Pointpillar\nand SMOKE models respectively.\n","authors":["Abhishek Balasubramaniam","Febin P Sunny","Sudeep Pasricha"],"pdf_url":"https://arxiv.org/pdf/2501.04213v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.06438v2","updated":"2025-01-08T01:11:07Z","published":"2024-01-12T08:18:42Z","title":"Improving Low-Light Image Recognition Performance Based on\n Image-adaptive Learnable Module","summary":" In recent years, significant progress has been made in image recognition\ntechnology based on deep neural networks. However, improving recognition\nperformance under low-light conditions remains a significant challenge. This\nstudy addresses the enhancement of recognition model performance in low-light\nconditions. We propose an image-adaptive learnable module, which applies\nappropriate image processing to input images, and a hyperparameter predictor to\nforecast optimal parameters used in the module. Our proposed approach allows\nfor the enhancement of recognition performance under low-light conditions by\neasily integrating as a front-end filter without the need to retrain existing\nrecognition models designed for low-light conditions. 
Through experiments, our\nproposed method demonstrates its contribution to enhancing image recognition\nperformance under low-light conditions.\n","authors":["Seitaro Ono","Yuka Ogino","Takahiro Toizumi","Atsushi Ito","Masato Tsukada"],"pdf_url":"https://arxiv.org/pdf/2401.06438v2.pdf","comment":"accepted to VISAPP2024"},{"id":"http://arxiv.org/abs/2501.04210v1","updated":"2025-01-08T01:09:49Z","published":"2025-01-08T01:09:49Z","title":"Recognition-Oriented Low-Light Image Enhancement based on Global and\n Pixelwise Optimization","summary":" In this paper, we propose a novel low-light image enhancement method aimed at\nimproving the performance of recognition models. Despite recent advances in\ndeep learning, the recognition of images under low-light conditions remains a\nchallenge. Although existing low-light image enhancement methods have been\ndeveloped to improve image visibility for human vision, they do not\nspecifically focus on enhancing recognition model performance. Our proposed\nlow-light image enhancement method consists of two key modules: the Global\nEnhance Module, which adjusts the overall brightness and color balance of the\ninput image, and the Pixelwise Adjustment Module, which refines image features\nat the pixel level. These modules are trained to enhance input images to\nimprove downstream recognition model performance effectively. Notably, the\nproposed method can be applied as a frontend filter to improve low-light\nrecognition performance without requiring retraining of downstream recognition\nmodels. 
Experimental results demonstrate the effectiveness of our method in improving\nthe performance of pretrained recognition models under low-light\nconditions.\n","authors":["Seitaro Ono","Yuka Ogino","Takahiro Toizumi","Atsushi Ito","Masato Tsukada"],"pdf_url":"https://arxiv.org/pdf/2501.04210v1.pdf","comment":"accepted to VISAPP2025"},{"id":"http://arxiv.org/abs/2412.05394v2","updated":"2025-01-08T01:06:34Z","published":"2024-12-06T19:40:00Z","title":"YOLOv5-Based Object Detection for Emergency Response in Aerial Imagery","summary":" This paper presents a robust approach for object detection in aerial imagery\nusing the YOLOv5 model. We focus on identifying critical objects such as\nambulances, car crashes, police vehicles, tow trucks, fire engines, overturned\ncars, and vehicles on fire. By leveraging a custom dataset, we outline the\ncomplete pipeline from data collection and annotation to model training and\nevaluation. Our results demonstrate that YOLOv5 effectively balances speed and\naccuracy, making it suitable for real-time emergency response applications.\nThis work addresses key challenges in aerial imagery, including small object\ndetection and complex backgrounds, and provides insights for future research in\nautomated emergency response systems.\n","authors":["Sindhu Boddu","Arindam Mukherjee"],"pdf_url":"https://arxiv.org/pdf/2412.05394v2.pdf","comment":"6 pages, 8 figures, submitted for open-access publication on arXiv"},{"id":"http://arxiv.org/abs/2501.04206v1","updated":"2025-01-08T00:54:43Z","published":"2025-01-08T00:54:43Z","title":"GRAPHITE: Graph-Based Interpretable Tissue Examination for Enhanced\n Explainability in Breast Cancer Histopathology","summary":" Explainable AI (XAI) in medical histopathology is essential for enhancing the\ninterpretability and clinical trustworthiness of deep learning models in cancer\ndiagnosis. However, the black-box nature of these models often limits their\nclinical adoption. 
We introduce GRAPHITE (Graph-based Interpretable Tissue\nExamination), a post-hoc explainable framework designed for breast cancer\ntissue microarray (TMA) analysis. GRAPHITE employs a multiscale approach,\nextracting patches at various magnification levels, constructing a\nhierarchical graph, and utilising graph attention networks (GAT) with scalewise\nattention (SAN) to capture scale-dependent features. We trained the model on\n140 tumour TMA cores and four benign whole slide images from which 140 benign\nsamples were created, and tested it on 53 pathologist-annotated TMA samples.\nGRAPHITE outperformed traditional XAI methods, achieving a mean average\nprecision (mAP) of 0.56, an area under the receiver operating characteristic\ncurve (AUROC) of 0.94, and a threshold robustness (ThR) of 0.70, indicating\nthat the model maintains high performance across a wide range of thresholds. In\nclinical utility, GRAPHITE achieved the highest area under the decision curve\n(AUDC) of 4.17e+5, indicating reliable decision support across thresholds.\nThese results highlight GRAPHITE's potential as a clinically valuable tool in\ncomputational pathology, providing interpretable visualisations that align with\nthe pathologists' diagnostic reasoning and support precision medicine.\n","authors":["Raktim Kumar Mondol","Ewan K. A. Millar","Peter H. Graham","Lois Browne","Arcot Sowmya","Erik Meijering"],"pdf_url":"https://arxiv.org/pdf/2501.04206v1.pdf","comment":"24 Pages, 9 Figures, 1 Table"},{"id":"http://arxiv.org/abs/2501.04204v1","updated":"2025-01-08T00:52:19Z","published":"2025-01-08T00:52:19Z","title":"LipGen: Viseme-Guided Lip Video Generation for Enhancing Visual Speech\n Recognition","summary":" Visual speech recognition (VSR), commonly known as lip reading, has garnered\nsignificant attention due to its wide-ranging practical applications. 
The\nadvent of deep learning techniques and advancements in hardware capabilities\nhave significantly enhanced the performance of lip reading models. Despite\nthese advancements, existing datasets predominantly feature stable video\nrecordings with limited variability in lip movements. This limitation results\nin models that are highly sensitive to variations encountered in real-world\nscenarios. To address this issue, we propose a novel framework, LipGen, which\naims to improve model robustness by leveraging speech-driven synthetic visual\ndata, thereby mitigating the constraints of current datasets. Additionally, we\nintroduce an auxiliary task that incorporates viseme classification alongside\nattention mechanisms. This approach facilitates the efficient integration of\ntemporal information, directing the model's focus toward the relevant segments\nof speech, thereby enhancing discriminative capabilities. Our method\ndemonstrates superior performance compared to the current state-of-the-art on\nthe lip reading in the wild (LRW) dataset and exhibits even more pronounced\nadvantages under challenging conditions.\n","authors":["Bowen Hao","Dongliang Zhou","Xiaojie Li","Xingyu Zhang","Liang Xie","Jianlong Wu","Erwei Yin"],"pdf_url":"https://arxiv.org/pdf/2501.04204v1.pdf","comment":"This paper has been accepted for presentation at ICASSP 2025"},{"id":"http://arxiv.org/abs/2501.04202v1","updated":"2025-01-08T00:43:31Z","published":"2025-01-08T00:43:31Z","title":"Generative Dataset Distillation Based on Self-knowledge Distillation","summary":" Dataset distillation is an effective technique for reducing the cost and\ncomplexity of model training while maintaining performance by compressing large\ndatasets into smaller, more efficient versions. In this paper, we present a\nnovel generative dataset distillation method that can improve the accuracy of\naligning prediction logits. 
Our approach integrates self-knowledge distillation\nto achieve more precise distribution matching between the synthetic and\noriginal data, thereby capturing the overall structure and relationships within\nthe data. To further improve the accuracy of alignment, we introduce a\nstandardization step on the logits before performing distribution matching,\nensuring consistency in the range of logits. Through extensive experiments, we\ndemonstrate that our method outperforms existing state-of-the-art methods,\nresulting in superior distillation performance.\n","authors":["Longzhen Li","Guang Li","Ren Togo","Keisuke Maeda","Takahiro Ogawa","Miki Haseyama"],"pdf_url":"https://arxiv.org/pdf/2501.04202v1.pdf","comment":"Accepted by ICASSP 2025"},{"id":"http://arxiv.org/abs/2404.15683v4","updated":"2025-01-08T00:27:00Z","published":"2024-04-24T06:35:56Z","title":"AnoFPDM: Anomaly Segmentation with Forward Process of Diffusion Models\n for Brain MRI","summary":" Weakly-supervised diffusion models (DMs) in anomaly segmentation, leveraging\nimage-level labels, have attracted significant attention for their superior\nperformance compared to unsupervised methods. This eliminates the need for\npixel-level labels in training, offering a more cost-effective alternative to\nsupervised methods. However, existing methods are not fully weakly-supervised\nbecause they heavily rely on costly pixel-level labels for hyperparameter\ntuning in inference. To tackle this challenge, we introduce Anomaly\nSegmentation with Forward Process of Diffusion Models (AnoFPDM), a fully\nweakly-supervised framework that operates without the need for pixel-level\nlabels. Leveraging the unguided forward process as a reference for the guided\nforward process, we select hyperparameters such as the noise scale, the\nthreshold for segmentation, and the guidance strength. We aggregate anomaly maps\nfrom the guided forward process, enhancing the signal strength of anomalous\nregions. 
Remarkably, our proposed method outperforms recent state-of-the-art\nweakly-supervised approaches, even without utilizing pixel-level labels.\n","authors":["Yiming Che","Fazle Rafsani","Jay Shah","Md Mahfuzur Rahman Siddiquee","Teresa Wu"],"pdf_url":"https://arxiv.org/pdf/2404.15683v4.pdf","comment":"v4: added appendices and fixed some typos"},{"id":"http://arxiv.org/abs/2407.05910v3","updated":"2025-01-08T23:40:38Z","published":"2024-07-08T13:15:11Z","title":"Enhancing Vision-Language Models with Scene Graphs for Traffic Accident\n Understanding","summary":" Recognizing a traffic accident is an essential part of any autonomous driving\nor road monitoring system. An accident can appear in a wide variety of forms,\nand understanding what type of accident is taking place may be useful to\nprevent it from recurring. This work focuses on classifying traffic scenes into\nspecific accident types. We approach the problem by representing a traffic\nscene as a graph, where objects such as cars can be represented as nodes, and\nrelative distances and directions between them as edges. This representation of\na traffic scene is referred to as a scene graph, and can be used as input for\nan accident classifier. Better results are obtained with a classifier that\nfuses the scene graph input with visual and textual representations. 
This work\nintroduces a multi-stage, multimodal pipeline that pre-processes videos of\ntraffic accidents, encodes them as scene graphs, and aligns this representation\nwith vision and language modalities before executing the classification task.\nWhen trained on 4 classes, our method achieves a balanced accuracy score of\n57.77% on an (unbalanced) subset of the popular Detection of Traffic Anomaly\n(DoTA) benchmark, representing an increase of close to 5 percentage points from\nthe case where scene graph information is not taken into account.\n","authors":["Aaron Lohner","Francesco Compagno","Jonathan Francis","Alessandro Oltramari"],"pdf_url":"https://arxiv.org/pdf/2407.05910v3.pdf","comment":"Won the 'Best Paper Runner-up Award' at the 2024 IEEE International\n Automated Vehicle Validation Conference (IAVVC 2024). Also accepted at the\n 1st Workshop on Semantic Reasoning and Goal Understanding in Robotics, at the\n Robotics Science and Systems Conference (RSS SemRob 2024)"},{"id":"http://arxiv.org/abs/2501.04878v1","updated":"2025-01-08T23:21:49Z","published":"2025-01-08T23:21:49Z","title":"Topological Classification of points in $Z^2$ by using Topological\n Numbers for $2$D discrete binary images","summary":" In this paper, we propose a topological classification of points for 2D\ndiscrete binary images. This classification is based on the values of the\ncalculus of topological numbers. Six classes of points are proposed: isolated\npoint, interior point, simple point, curve point, point of intersection of 3\ncurves, point of intersection of 4 curves. 
The number of configurations of each\nclass is also given.\n","authors":["Christophe Lohou"],"pdf_url":"https://arxiv.org/pdf/2501.04878v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2410.21588"},{"id":"http://arxiv.org/abs/2410.06154v3","updated":"2025-01-08T23:08:03Z","published":"2024-10-08T15:55:40Z","title":"GLOV: Guided Large Language Models as Implicit Optimizers for Vision\n Language Models","summary":" In this work, we propose a novel method (GLOV) enabling Large Language Models\n(LLMs) to act as implicit Optimizers for Vision-Language Models (VLMs) to\nenhance downstream vision tasks. Our GLOV meta-prompts an LLM with the\ndownstream task description, querying it for suitable VLM prompts (e.g., for\nzero-shot classification with CLIP). These prompts are ranked according to a\npurity measure obtained through a fitness function. In each respective\noptimization step, the ranked prompts are fed as in-context examples (with\ntheir accuracies) to equip the LLM with the knowledge of the type of text\nprompts preferred by the downstream VLM. Furthermore, we also explicitly steer\nthe LLM generation process in each optimization step by specifically adding an\noffset difference vector of the embeddings from the positive and negative\nsolutions found by the LLM, in previous optimization steps, to the intermediate\nlayer of the network for the next generation step. This offset vector steers\nthe LLM generation toward the type of language preferred by the downstream VLM,\nresulting in enhanced performance on the downstream vision tasks. We\ncomprehensively evaluate our GLOV on 16 diverse datasets using two families of\nVLMs, i.e., dual-encoder (e.g., CLIP) and encoder-decoder (e.g., LLaVa) models\n-- showing that the discovered solutions can enhance the recognition\nperformance by up to 15.0% and 57.5% (3.8% and 21.6% on average) for these\nmodels.\n","authors":["M. 
Jehanzeb Mirza","Mengjie Zhao","Zhuoyuan Mao","Sivan Doveh","Wei Lin","Paul Gavrikov","Michael Dorkenwald","Shiqi Yang","Saurav Jha","Hiromi Wakaki","Yuki Mitsufuji","Horst Possegger","Rogerio Feris","Leonid Karlinsky","James Glass"],"pdf_url":"https://arxiv.org/pdf/2410.06154v3.pdf","comment":"Code: https://github.com/jmiemirza/GLOV"},{"id":"http://arxiv.org/abs/2501.04873v1","updated":"2025-01-08T23:07:10Z","published":"2025-01-08T23:07:10Z","title":"Back Home: A Machine Learning Approach to Seashell Classification and\n Ecosystem Restoration","summary":" In Costa Rica, an average of 5 tons of seashells are extracted from\necosystems annually. Confiscated seashells cannot be returned to their\necosystems due to the lack of origin recognition. To address this issue, we\ndeveloped a convolutional neural network (CNN) specifically for seashell\nidentification. We built a dataset from scratch, consisting of approximately\n19,000 images from the Pacific and Caribbean coasts. Using this dataset, the\nmodel achieved a classification accuracy exceeding 85%. The model has been\nintegrated into a user-friendly application, which has classified over 36,000\nseashells to date, delivering real-time results within 3 seconds per image. To\nfurther enhance the system's accuracy, an anomaly detection mechanism was\nincorporated to filter out irrelevant or anomalous inputs, ensuring only valid\nseashell images are processed.\n","authors":["Alexander Valverde","Luis Solano"],"pdf_url":"https://arxiv.org/pdf/2501.04873v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.00915v3","updated":"2025-01-08T22:58:51Z","published":"2023-03-02T02:20:04Z","title":"BiomedCLIP: a multimodal biomedical foundation model pretrained from\n fifteen million scientific image-text pairs","summary":" Biomedical data is inherently multimodal, comprising physical measurements\nand natural language narratives. 
A generalist biomedical AI model needs to\nsimultaneously process different modalities of data, including text and images.\nTherefore, training an effective generalist biomedical model requires\nhigh-quality multimodal data, such as parallel image-text pairs. Here, we\npresent PMC-15M, a novel dataset that is two orders of magnitude larger than\nexisting biomedical multimodal datasets such as MIMIC-CXR, and spans a diverse\nrange of biomedical image types. PMC-15M contains 15 million biomedical\nimage-text pairs collected from 4.4 million scientific articles. Based on\nPMC-15M, we have pretrained BiomedCLIP, a multimodal foundation model, with\ndomain-specific adaptations tailored to biomedical vision-language processing.\nWe conducted extensive experiments and ablation studies on standard biomedical\nimaging tasks from retrieval to classification to visual question-answering\n(VQA). BiomedCLIP achieved new state-of-the-art results in a wide range of\nstandard datasets, substantially outperforming prior approaches. Intriguingly,\nby large-scale pretraining on diverse biomedical image types, BiomedCLIP even\noutperforms state-of-the-art radiology-specific models such as BioViL in\nradiology-specific tasks such as RSNA pneumonia detection. In summary,\nBiomedCLIP is a fully open-access foundation model that achieves\nstate-of-the-art performance on various biomedical tasks, paving the way for\ntransformative multimodal biomedical discovery and applications. We release our\nmodels at https://aka.ms/biomedclip to facilitate future research in multimodal\nbiomedical AI.\n","authors":["Sheng Zhang","Yanbo Xu","Naoto Usuyama","Hanwen Xu","Jaspreet Bagga","Robert Tinn","Sam Preston","Rajesh Rao","Mu Wei","Naveen Valluri","Cliff Wong","Andrea Tupini","Yu Wang","Matt Mazzola","Swadheen Shukla","Lars Liden","Jianfeng Gao","Angela Crabtree","Brian Piening","Carlo Bifulco","Matthew P. 
Lungren","Tristan Naumann","Sheng Wang","Hoifung Poon"],"pdf_url":"https://arxiv.org/pdf/2303.00915v3.pdf","comment":"The models are released at https://aka.ms/biomedclip"},{"id":"http://arxiv.org/abs/2501.04861v1","updated":"2025-01-08T22:22:44Z","published":"2025-01-08T22:22:44Z","title":"LayerMix: Enhanced Data Augmentation through Fractal Integration for\n Robust Deep Learning","summary":" Deep learning models have demonstrated remarkable performance across various\ncomputer vision tasks, yet their vulnerability to distribution shifts remains a\ncritical challenge. Despite sophisticated neural network architectures,\nexisting models often struggle to maintain consistent performance when\nconfronted with Out-of-Distribution (OOD) samples, including natural\ncorruptions, adversarial perturbations, and anomalous patterns. We introduce\nLayerMix, an innovative data augmentation approach that systematically enhances\nmodel robustness through structured fractal-based image synthesis. By\nmeticulously integrating structural complexity into training datasets, our\nmethod generates semantically consistent synthetic samples that significantly\nimprove neural network generalization capabilities. Unlike traditional\naugmentation techniques that rely on random transformations, LayerMix employs a\nstructured mixing pipeline that preserves original image semantics while\nintroducing controlled variability. Extensive experiments across multiple\nbenchmark datasets, including CIFAR-10, CIFAR-100, ImageNet-200, and\nImageNet-1K, demonstrate LayerMix's superior performance in classification\naccuracy and substantial improvements in critical Machine Learning (ML) safety\nmetrics, including resilience to natural image corruptions, robustness against\nadversarial attacks, improved model calibration, and enhanced prediction\nconsistency. 
LayerMix represents a significant advancement toward developing\nmore reliable and adaptable artificial intelligence systems by addressing the\nfundamental challenges of deep learning generalization. The code is available\nat https://github.com/ahmadmughees/layermix.\n","authors":["Hafiz Mughees Ahmad","Dario Morle","Afshin Rahimi"],"pdf_url":"https://arxiv.org/pdf/2501.04861v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04846v1","updated":"2025-01-08T21:15:14Z","published":"2025-01-08T21:15:14Z","title":"EDMB: Edge Detector with Mamba","summary":" Transformer-based models have made significant progress in edge detection,\nbut their high computational cost is prohibitive. Recently, vision Mamba have\nshown excellent ability in efficiently capturing long-range dependencies.\nDrawing inspiration from this, we propose a novel edge detector with Mamba,\ntermed EDMB, to efficiently generate high-quality multi-granularity edges. In\nEDMB, Mamba is combined with a global-local architecture, therefore it can\nfocus on both global information and fine-grained cues. The fine-grained cues\nplay a crucial role in edge detection, but are usually ignored by ordinary\nMamba. We design a novel decoder to construct learnable Gaussian distributions\nby fusing global features and fine-grained features. And the multi-grained\nedges are generated by sampling from the distributions. In order to make\nmulti-granularity edges applicable to single-label data, we introduce Evidence\nLower Bound loss to supervise the learning of the distributions. On the\nmulti-label dataset BSDS500, our proposed EDMB achieves competitive\nsingle-granularity ODS 0.837 and multi-granularity ODS 0.851 without\nmulti-scale test or extra PASCAL-VOC data. Remarkably, EDMB can be extended to\nsingle-label datasets such as NYUDv2 and BIPED. 
The source code is available at\nhttps://github.com/Li-yachuan/EDMB.\n","authors":["Yachuan Li","Xavier Soria Poma","Yun Bai","Qian Xiao","Chaozhi Yang","Guanlin Li","Zongmin Li"],"pdf_url":"https://arxiv.org/pdf/2501.04846v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.02640v2","updated":"2025-01-08T20:57:22Z","published":"2025-01-05T20:05:10Z","title":"Multispectral Pedestrian Detection with Sparsely Annotated Label","summary":" Although existing Sparsely Annotated Object Detection (SAOD) approaches have\nmade progress in handling sparsely annotated environments in the multispectral\ndomain, where only some pedestrians are annotated, they still have the\nfollowing limitations: (i) they lack considerations for improving the quality\nof pseudo-labels for missing annotations, and (ii) they rely on fixed ground\ntruth annotations, which leads to learning only a limited range of pedestrian\nvisual appearances in the multispectral domain. To address these issues, we\npropose a novel framework called Sparsely Annotated Multispectral Pedestrian\nDetection (SAMPD). For limitation (i), we introduce Multispectral\nPedestrian-aware Adaptive Weight (MPAW) and Positive Pseudo-label Enhancement\n(PPE) modules. Utilizing multispectral knowledge, these modules ensure the\ngeneration of high-quality pseudo-labels and enable effective learning by\nincreasing weights for high-quality pseudo-labels based on modality\ncharacteristics. To address limitation (ii), we propose an Adaptive Pedestrian\nRetrieval Augmentation (APRA) module, which adaptively incorporates pedestrian\npatches from ground-truth and dynamically integrates high-quality pseudo-labels\nwith the ground-truth, facilitating a more diverse learning pool of\npedestrians. 
Extensive experimental results demonstrate that our SAMPD\nsignificantly enhances performance in sparsely annotated environments within\nthe multispectral domain.\n","authors":["Chan Lee","Seungho Shin","Gyeong-Moon Park","Jung Uk Kim"],"pdf_url":"https://arxiv.org/pdf/2501.02640v2.pdf","comment":"Accepted at AAAI 2025"},{"id":"http://arxiv.org/abs/2501.04815v1","updated":"2025-01-08T20:11:09Z","published":"2025-01-08T20:11:09Z","title":"Towards Generalizable Trajectory Prediction Using Dual-Level\n Representation Learning And Adaptive Prompting","summary":" Existing vehicle trajectory prediction models struggle with generalizability,\nprediction uncertainties, and handling complex interactions. This is often due to\nlimitations like complex architectures customized for a specific dataset and\ninefficient multimodal handling. We propose Perceiver with Register queries\n(PerReg+), a novel trajectory prediction framework that introduces: (1)\nDual-Level Representation Learning via Self-Distillation (SD) and Masked\nReconstruction (MR), capturing global context and fine-grained details.\nAdditionally, our approach of reconstructing segment-level trajectories and lane\nsegments from masked inputs with query drop enables effective use of\ncontextual information and improves generalization; (2) Enhanced Multimodality\nusing register-based queries and pretraining, eliminating the need for\nclustering and suppression; and (3) Adaptive Prompt Tuning during fine-tuning,\nfreezing the main architecture and optimizing a small number of prompts for\nefficient adaptation. PerReg+ achieves new state-of-the-art performance on\nnuScenes [1], Argoverse 2 [2], and Waymo Open Motion Dataset (WOMD) [3].\nRemarkably, our pretrained model reduces the error by 6.8% on smaller datasets,\nand multi-dataset training enhances generalization. 
In cross-domain tests,\nPerReg+ reduces B-FDE by 11.8% compared to its non-pretrained variant.\n","authors":["Kaouther Messaoud","Matthieu Cord","Alexandre Alahi"],"pdf_url":"https://arxiv.org/pdf/2501.04815v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04794v1","updated":"2025-01-08T19:18:44Z","published":"2025-01-08T19:18:44Z","title":"A Steerable Deep Network for Model-Free Diffusion MRI Registration","summary":" Nonrigid registration is vital to medical image analysis but remains\nchallenging for diffusion MRI (dMRI) due to its high-dimensional,\norientation-dependent nature. While classical methods are accurate, they are\ncomputationally demanding, and deep neural networks, though efficient, have\nbeen underexplored for nonrigid dMRI registration compared to structural\nimaging. We present a novel, deep learning framework for model-free, nonrigid\nregistration of raw diffusion MRI data that does not require explicit\nreorientation. Unlike previous methods relying on derived representations such\nas diffusion tensors or fiber orientation distribution functions, in our\napproach, we formulate the registration as an equivariant diffeomorphism of\nposition-and-orientation space. Central to our method is an\n$\\mathsf{SE}(3)$-equivariant UNet that generates velocity fields while\npreserving the geometric properties of a raw dMRI's domain. We introduce a new\nloss function based on the maximum mean discrepancy in Fourier space,\nimplicitly matching ensemble average propagators across images. Experimental\nresults on Human Connectome Project dMRI data demonstrate competitive\nperformance compared to state-of-the-art approaches, with the added advantage\nof bypassing the overhead for estimating derived representations. This work\nestablishes a foundation for data-driven, geometry-aware dMRI registration\ndirectly in the acquisition space.\n","authors":["Gianfranco Cortes","Baba C. 
Vemuri"],"pdf_url":"https://arxiv.org/pdf/2501.04794v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04784v1","updated":"2025-01-08T19:02:32Z","published":"2025-01-08T19:02:32Z","title":"Leveraging Registers in Vision Transformers for Robust Adaptation","summary":" Vision Transformers (ViTs) have shown success across a variety of tasks due\nto their ability to capture global image representations. Recent studies have\nidentified the existence of high-norm tokens in ViTs, which can interfere with\nunsupervised object discovery. To address this, the use of \"registers\" which\nare additional tokens that isolate high norm patch tokens while capturing\nglobal image-level information has been proposed. While registers have been\nstudied extensively for object discovery, their generalization properties\nparticularly in out-of-distribution (OOD) scenarios, remains underexplored. In\nthis paper, we examine the utility of register token embeddings in providing\nadditional features for improving generalization and anomaly rejection. To that\nend, we propose a simple method that combines the special CLS token embedding\ncommonly employed in ViTs with the average-pooled register embeddings to create\nfeature representations which are subsequently used for training a downstream\nclassifier. We find that this enhances OOD generalization and anomaly\nrejection, while maintaining in-distribution (ID) performance. Extensive\nexperiments across multiple ViT backbones trained with and without registers\nreveal consistent improvements of 2-4\\% in top-1 OOD accuracy and a 2-3\\%\nreduction in false positive rates for anomaly detection. Importantly, these\ngains are achieved without additional computational overhead.\n","authors":["Srikar Yellapragada","Kowshik Thopalli","Vivek Narayanaswamy","Wesam Sakla","Yang Liu","Yamen Mubarka","Dimitris Samaras","Jayaraman J. 
Thiagarajan"],"pdf_url":"https://arxiv.org/pdf/2501.04784v1.pdf","comment":"Accepted at ICASSP 2025"},{"id":"http://arxiv.org/abs/2501.04782v1","updated":"2025-01-08T19:01:12Z","published":"2025-01-08T19:01:12Z","title":"GaussianVideo: Efficient Video Representation via Hierarchical Gaussian\n Splatting","summary":" Efficient neural representations for dynamic video scenes are critical for\napplications ranging from video compression to interactive simulations. Yet,\nexisting methods often face challenges related to high memory usage, lengthy\ntraining times, and temporal consistency. To address these issues, we introduce\na novel neural video representation that combines 3D Gaussian splatting with\ncontinuous camera motion modeling. By leveraging Neural ODEs, our approach\nlearns smooth camera trajectories while maintaining an explicit 3D scene\nrepresentation through Gaussians. Additionally, we introduce a spatiotemporal\nhierarchical learning strategy, progressively refining spatial and temporal\nfeatures to enhance reconstruction quality and accelerate convergence. This\nmemory-efficient approach achieves high-quality rendering at impressive speeds.\nExperimental results show that our hierarchical learning, combined with robust\ncamera motion modeling, captures complex dynamic scenes with strong temporal\nconsistency, achieving state-of-the-art performance across diverse video\ndatasets in both high- and low-motion scenarios.\n","authors":["Andrew Bond","Jui-Hsien Wang","Long Mai","Erkut Erdem","Aykut Erdem"],"pdf_url":"https://arxiv.org/pdf/2501.04782v1.pdf","comment":"10 pages, 10 figures"},{"id":"http://arxiv.org/abs/2501.04765v1","updated":"2025-01-08T18:38:25Z","published":"2025-01-08T18:38:25Z","title":"TREAD: Token Routing for Efficient Architecture-agnostic Diffusion\n Training","summary":" Diffusion models have emerged as the mainstream approach for visual\ngeneration. 
However, these models usually suffer from sample inefficiency and\nhigh training costs. This issue is particularly pronounced in the standard\ndiffusion transformer architecture due to its quadratic complexity relative to\ninput length. Recent works have addressed this by reducing the number of tokens\nprocessed in the model, often through masking. In contrast, this work aims to\nimprove the training efficiency of the diffusion backbone by using predefined\nroutes that store this information until it is reintroduced to deeper layers of\nthe model, rather than discarding these tokens entirely. Further, we combine\nmultiple routes and introduce an adapted auxiliary loss that accounts for all\napplied routes. Our method is not limited to the common transformer-based model\n- it can also be applied to state-space models. Unlike most current approaches,\nTREAD achieves this without architectural modifications. Finally, we show that\nour method reduces the computational cost and simultaneously boosts model\nperformance on the standard benchmark ImageNet-1K 256 x 256 in\nclass-conditional synthesis. Both of these benefits multiply to a convergence\nspeedup of 9.55x at 400K training iterations compared to DiT and 25.39x\ncompared to the best benchmark performance of DiT at 7M training iterations.\n","authors":["Felix Krause","Timy Phan","Vincent Tao Hu","Björn Ommer"],"pdf_url":"https://arxiv.org/pdf/2501.04765v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04764v1","updated":"2025-01-08T18:35:48Z","published":"2025-01-08T18:35:48Z","title":"Video Summarisation with Incident and Context Information using\n Generative AI","summary":" The proliferation of video content production has led to vast amounts of\ndata, posing substantial challenges in terms of analysis efficiency and\nresource utilization. Addressing this issue calls for the development of robust\nvideo analysis tools. 
This paper proposes a novel approach leveraging\nGenerative Artificial Intelligence (GenAI) to facilitate streamlined video\nanalysis. Our tool aims to deliver tailored textual summaries of user-defined\nqueries, offering a focused insight amidst extensive video datasets. Unlike\nconventional frameworks that offer generic summaries or limited action\nrecognition, our method harnesses the power of GenAI to distil relevant\ninformation, enhancing analysis precision and efficiency. Employing YOLO-V8 for\nobject detection and Gemini for comprehensive video and text analysis, our\nsolution achieves heightened contextual accuracy. By combining YOLO with\nGemini, our approach furnishes textual summaries extracted from extensive CCTV\nfootage, enabling users to swiftly navigate and verify pertinent events without\nthe need for exhaustive manual review. The quantitative evaluation revealed a\nsimilarity of 72.8%, while the qualitative assessment rated an accuracy of 85%,\ndemonstrating the capability of the proposed method.\n","authors":["Ulindu De Silva","Leon Fernando","Kalinga Bandara","Rashmika Nawaratne"],"pdf_url":"https://arxiv.org/pdf/2501.04764v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04750v1","updated":"2025-01-08T16:17:05Z","published":"2025-01-08T16:17:05Z","title":"Efficient License Plate Recognition in Videos Using Visual Rhythm and\n Accumulative Line Analysis","summary":" Video-based Automatic License Plate Recognition (ALPR) involves extracting\nvehicle license plate text information from video captures. Traditional systems\ntypically rely heavily on high-end computing resources and utilize multiple\nframes to recognize license plates, leading to increased computational\noverhead. In this paper, we propose two methods capable of efficiently\nextracting exactly one frame per vehicle and recognizing its license plate\ncharacters from this single image, thus significantly reducing computational\ndemands. 
The first method uses Visual Rhythm (VR) to generate time-spatial\nimages from videos, while the second employs Accumulative Line Analysis (ALA),\na novel algorithm based on single-line video processing for real-time\noperation. Both methods leverage YOLO for license plate detection within the\nframe and a Convolutional Neural Network (CNN) for Optical Character\nRecognition (OCR) to extract textual information. Experiments on real videos\ndemonstrate that the proposed methods achieve results comparable to traditional\nframe-by-frame approaches, with processing speeds three times faster.\n","authors":["Victor Nascimento Ribeiro","Nina S. T. Hirata"],"pdf_url":"https://arxiv.org/pdf/2501.04750v1.pdf","comment":"Accepted for presentation at the Conference on Graphics, Patterns and\n Images (SIBGRAPI) 2024"},{"id":"http://arxiv.org/abs/2501.05488v1","updated":"2025-01-08T18:57:05Z","published":"2025-01-08T18:57:05Z","title":"EndoDINO: A Foundation Model for GI Endoscopy","summary":" In this work, we present EndoDINO, a foundation model for GI endoscopy tasks\nthat achieves strong generalizability by pre-training on a well-curated image\ndataset sampled from the largest known GI endoscopy video dataset in the\nliterature. Specifically, we pre-trained ViT models with 1B, 307M, and 86M\nparameters using datasets ranging from 100K to 10M curated images. 
Using\nEndoDINO as a frozen feature encoder, we achieved state-of-the-art performance\nin anatomical landmark classification, polyp segmentation, and Mayo endoscopic\nscoring (MES) for ulcerative colitis with only simple decoder heads.\n","authors":["Patrick Dermyer","Angad Kalra","Matt Schwartz"],"pdf_url":"https://arxiv.org/pdf/2501.05488v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2501.04695v1","updated":"2025-01-08T18:58:22Z","published":"2025-01-08T18:58:22Z","title":"Re-ranking the Context for Multimodal Retrieval Augmented Generation","summary":" Retrieval-augmented generation (RAG) enhances large language models (LLMs) by\nincorporating external knowledge to generate a response within a context with\nimproved accuracy and reduced hallucinations. However, multi-modal RAG systems\nface unique challenges: (i) the retrieval process may select irrelevant entries\nto user query (e.g., images, documents), and (ii) vision-language models or\nmulti-modal language models like GPT-4o may hallucinate when processing these\nentries to generate RAG output. In this paper, we aim to address the first\nchallenge, i.e, improving the selection of relevant context from the\nknowledge-base in retrieval phase of the multi-modal RAG. Specifically, we\nleverage the relevancy score (RS) measure designed in our previous work for\nevaluating the RAG performance to select more relevant entries in retrieval\nprocess. The retrieval based on embeddings, say CLIP-based embedding, and\ncosine similarity usually perform poorly particularly for multi-modal data. We\nshow that by using a more advanced relevancy measure, one can enhance the\nretrieval process by selecting more relevant pieces from the knowledge-base and\neliminate the irrelevant pieces from the context by adaptively selecting\nup-to-$k$ entries instead of fixed number of entries. 
Our evaluation using COCO\ndataset demonstrates significant enhancement in selecting relevant context and\naccuracy of the generated response.\n","authors":["Matin Mortaheb","Mohammad A. Amir Khojastepour","Srimat T. Chakradhar","Sennur Ulukus"],"pdf_url":"https://arxiv.org/pdf/2501.04695v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04652v1","updated":"2025-01-08T18:05:30Z","published":"2025-01-08T18:05:30Z","title":"Multi-task retriever fine-tuning for domain-specific and efficient RAG","summary":" Retrieval-Augmented Generation (RAG) has become ubiquitous when deploying\nLarge Language Models (LLMs), as it can address typical limitations such as\ngenerating hallucinated or outdated information. However, when building\nreal-world RAG applications, practical issues arise. First, the retrieved\ninformation is generally domain-specific. Since it is computationally expensive\nto fine-tune LLMs, it is more feasible to fine-tune the retriever to improve\nthe quality of the data included in the LLM input. Second, as more applications\nare deployed in the same real-world system, one cannot afford to deploy\nseparate retrievers. Moreover, these RAG applications normally retrieve\ndifferent kinds of data. Our solution is to instruction fine-tune a small\nretriever encoder on a variety of domain-specific tasks to allow us to deploy\none encoder that can serve many use cases, thereby achieving low-cost,\nscalability, and speed. We show how this encoder generalizes to out-of-domain\nsettings as well as to an unseen retrieval task on real-world enterprise use\ncases.\n","authors":["Patrice Béchard","Orlando Marquez Ayala"],"pdf_url":"https://arxiv.org/pdf/2501.04652v1.pdf","comment":"9 pages, 2 figures. 
Submitted to NAACL 2025 Industry Track"},{"id":"http://arxiv.org/abs/2501.04635v1","updated":"2025-01-08T17:29:46Z","published":"2025-01-08T17:29:46Z","title":"Knowledge Retrieval Based on Generative AI","summary":" This study develops a question-answering system based on Retrieval-Augmented\nGeneration (RAG) using Chinese Wikipedia and Lawbank as retrieval sources.\nUsing TTQA and TMMLU+ as evaluation datasets, the system employs BGE-M3 for\ndense vector retrieval to obtain highly relevant search results and\nBGE-reranker to reorder these results based on query relevance. The most\npertinent retrieval outcomes serve as reference knowledge for a Large Language\nModel (LLM), enhancing its ability to answer questions and establishing a\nknowledge retrieval system grounded in generative AI.\n The system's effectiveness is assessed through a two-stage evaluation:\nautomatic and assisted performance evaluations. The automatic evaluation\ncalculates accuracy by comparing the model's auto-generated labels with ground\ntruth answers, measuring performance under standardized conditions without\nhuman intervention. The assisted performance evaluation involves 20\nfinance-related multiple-choice questions answered by 20 participants without\nfinancial backgrounds. Initially, participants answer independently. Later,\nthey receive system-generated reference information to assist in answering,\nexamining whether the system improves accuracy when assistance is provided.\n The main contributions of this research are: (1) Enhanced LLM Capability: By\nintegrating BGE-M3 and BGE-reranker, the system retrieves and reorders highly\nrelevant results, reduces hallucinations, and dynamically accesses authorized\nor public knowledge sources. (2) Improved Data Privacy: A customized RAG\narchitecture enables local operation of the LLM, eliminating the need to send\nprivate data to external servers. 
This approach enhances data security, reduces\nreliance on commercial services, lowers operational costs, and mitigates\nprivacy risks.\n","authors":["Te-Lun Yang","Jyi-Shane Liu","Yuen-Hsien Tseng","Jyh-Shing Roger Jang"],"pdf_url":"https://arxiv.org/pdf/2501.04635v1.pdf","comment":"8 pages, 13 figures, 1 table"},{"id":"http://arxiv.org/abs/2501.04630v1","updated":"2025-01-08T17:22:03Z","published":"2025-01-08T17:22:03Z","title":"Evaluating Interval-based Tokenization for Pitch Representation in\n Symbolic Music Analysis","summary":" Symbolic music analysis tasks are often performed by models originally\ndeveloped for Natural Language Processing, such as Transformers. Such models\nrequire the input data to be represented as sequences, which is achieved\nthrough a process of tokenization. Tokenization strategies for symbolic music\noften rely on absolute MIDI values to represent pitch information. However,\nmusic research largely promotes the benefit of higher-level representations\nsuch as melodic contour and harmonic relations for which pitch intervals turn\nout to be more expressive than absolute pitches. In this work, we introduce a\ngeneral framework for building interval-based tokenizations. 
By evaluating\nthese tokenizations on three music analysis tasks, we show that such\ninterval-based tokenizations improve model performances and facilitate their\nexplainability.\n","authors":["Dinh-Viet-Toan Le","Louis Bigo","Mikaela Keller"],"pdf_url":"https://arxiv.org/pdf/2501.04630v1.pdf","comment":"Accepted at Artificial Intelligence for Music Workshop at AAAI 2025\n (https://ai4musicians.org/2025aaai.html)"},{"id":"http://arxiv.org/abs/2405.10587v3","updated":"2025-01-08T11:21:12Z","published":"2024-05-17T07:22:02Z","title":"RDRec: Rationale Distillation for LLM-based Recommendation","summary":" Large language model (LLM)-based recommender models that bridge users and\nitems through textual prompts for effective semantic reasoning have gained\nconsiderable attention. However, few methods consider the underlying rationales\nbehind interactions, such as user preferences and item attributes, limiting the\nreasoning capability of LLMs for recommendations. This paper proposes a\nrationale distillation recommender (RDRec), a compact model designed to learn\nrationales generated by a larger language model (LM). By leveraging rationales\nfrom reviews related to users and items, RDRec remarkably specifies their\nprofiles for recommendations. Experiments show that RDRec achieves\nstate-of-the-art (SOTA) performance in both top-N and sequential\nrecommendations. Our source code is released at\nhttps://github.com/WangXFng/RDRec.\n","authors":["Xinfeng Wang","Jin Cui","Yoshimi Suzuki","Fumiyo Fukumoto"],"pdf_url":"https://arxiv.org/pdf/2405.10587v3.pdf","comment":"10 pages. 
Accepted to ACL 2024 Main as a short paper"},{"id":"http://arxiv.org/abs/2501.04420v1","updated":"2025-01-08T11:08:58Z","published":"2025-01-08T11:08:58Z","title":"A Closer Look on Gender Stereotypes in Movie Recommender Systems and\n Their Implications with Privacy","summary":" The movie recommender system typically leverages user feedback to provide\npersonalized recommendations that align with user preferences and increase\nbusiness revenue. This study investigates the impact of gender stereotypes on\nsuch systems through a specific attack scenario. In this scenario, an attacker\ndetermines users' gender, a private attribute, by exploiting gender stereotypes\nabout movie preferences and analyzing users' feedback data, which is either\npublicly available or observed within the system. The study consists of two\nphases. In the first phase, a user study involving 630 participants identified\ngender stereotypes associated with movie genres, which often influence viewing\nchoices. In the second phase, four inference algorithms were applied to detect\ngender stereotypes by combining the findings from the first phase with users'\nfeedback data. Results showed that these algorithms performed more effectively\nthan relying solely on feedback data for gender inference. Additionally, we\nquantified the extent of gender stereotypes to evaluate their broader impact on\ndigital computational science. The latter part of the study utilized two major\nmovie recommender datasets: MovieLens 1M and Yahoo!Movie. Detailed experimental\ninformation is available on our GitHub repository:\nhttps://github.com/fr-iit/GSMRS\n","authors":["Falguni Roy","Yiduo Shen","Na Zhao","Xiaofeng Ding","Md. 
Omar Faruk"],"pdf_url":"https://arxiv.org/pdf/2501.04420v1.pdf","comment":"19 pages, 2 figures"},{"id":"http://arxiv.org/abs/2501.04410v1","updated":"2025-01-08T10:49:13Z","published":"2025-01-08T10:49:13Z","title":"User Simulation in the Era of Generative AI: User Modeling, Synthetic\n Data Generation, and System Evaluation","summary":" User simulation is an emerging interdisciplinary topic with multiple critical\napplications in the era of Generative AI. It involves creating an intelligent\nagent that mimics the actions of a human user interacting with an AI system,\nenabling researchers to model and analyze user behaviour, generate synthetic\ndata for training, and evaluate interactive AI systems in a controlled and\nreproducible manner. User simulation has profound implications for diverse\nfields and plays a vital role in the pursuit of Artificial General\nIntelligence. This paper provides an overview of user simulation, highlighting\nits key applications, connections to various disciplines, and outlining future\nresearch directions to advance this increasingly important technology.\n","authors":["Krisztian Balog","ChengXiang Zhai"],"pdf_url":"https://arxiv.org/pdf/2501.04410v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04364v1","updated":"2025-01-08T09:03:16Z","published":"2025-01-08T09:03:16Z","title":"An innovative data collection method to eliminate the preprocessing\n phase in web usage mining","summary":" The underlying data source for web usage mining (WUM) is commonly thought to\nbe server logs. However, access log files ensure quite limited data about the\nclients. Identifying sessions from this messy data takes a considerable effort,\nand operations performed for this purpose do not always yield excellent\nresults. Also, this data cannot be used for web analytics efficiently. This\nstudy proposes an innovative method for user tracking, session management, and\ncollecting web usage data. 
The method is mainly based on a new approach for\nusing collected data for web analytics extraction as the data source in web\nusage mining. An application-based API has been developed with a different\nstrategy from conventional client-side methods to obtain and process log data.\nThe log data has been successfully gathered by integrating the technique into\nan enterprise web application. The results reveal that the homogeneous\nstructured data collected and stored with this method is more convenient to\nbrowse, filter, and process than web server logs. This data stored on a\nrelational database can be used effortlessly as a reliable data source for\nhigh-performance web usage mining activity, real-time web analytics, or a\nfunctional recommendation system.\n","authors":["Ozkan Canay","Umit Kocabicak"],"pdf_url":"https://arxiv.org/pdf/2501.04364v1.pdf","comment":"15 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.12743v3","updated":"2025-01-08T07:00:36Z","published":"2023-08-24T12:45:02Z","title":"Network-Based Video Recommendation Using Viewing Patterns and Modularity\n Analysis: An Integrated Framework","summary":" The proliferation of video-on-demand (VOD) services has led to a paradox of\nchoice, overwhelming users with vast content libraries and revealing\nlimitations in current recommender systems. This research introduces a novel\napproach by combining implicit user data, such as viewing percentages, with\nsocial network analysis to enhance personalization in VOD platforms. The\nmethodology constructs user-item interaction graphs based on viewing patterns\nand applies centrality measures (degree, closeness, and betweenness) to\nidentify important videos. Modularity-based clustering groups related content,\nenabling personalized recommendations. The system was evaluated on a\ndocumentary-focused VOD platform with 328 users over four months. 
Results\nshowed significant improvements: a 63% increase in click-through rate (CTR), a\n24% increase in view completion rate, and a 17% improvement in user\nsatisfaction. The approach outperformed traditional methods like Naive Bayes\nand SVM. Future research should explore advanced techniques, such as matrix\nfactorization models, graph neural networks, and hybrid approaches combining\ncontent-based and collaborative filtering. Additionally, incorporating temporal\nmodels and addressing scalability challenges for large-scale platforms are\nessential next steps. This study contributes to the state of the art by\nintroducing modularity-based clustering and ego-centric ranking methods to\nenhance personalization in video recommendations. The findings suggest that\nintegrating network-based features and implicit feedback can significantly\nimprove user engagement, offering a cost-effective solution for VOD platforms\nto enhance recommendation quality.\n","authors":["Mehrdad Maghsoudi","Mohammad Hossein valikhani","Mohammad Hossein Zohdi"],"pdf_url":"https://arxiv.org/pdf/2308.12743v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.00309v2","updated":"2025-01-08T05:16:25Z","published":"2024-12-31T06:59:35Z","title":"Retrieval-Augmented Generation with Graphs (GraphRAG)","summary":" Retrieval-augmented generation (RAG) is a powerful technique that enhances\ndownstream task execution by retrieving additional information, such as\nknowledge, skills, and tools from external sources. Graph, by its intrinsic\n\"nodes connected by edges\" nature, encodes massive heterogeneous and relational\ninformation, making it a golden resource for RAG in tremendous real-world\napplications. As a result, we have recently witnessed increasing attention on\nequipping RAG with Graph, i.e., GraphRAG. 
However, unlike conventional RAG,\nwhere the retriever, generator, and external data sources can be uniformly\ndesigned in the neural-embedding space, the uniqueness of graph-structured\ndata, such as diverse-formatted and domain-specific relational knowledge, poses\nunique and significant challenges when designing GraphRAG for different\ndomains. Given the broad applicability, the associated design challenges, and\nthe recent surge in GraphRAG, a systematic and up-to-date survey of its key\nconcepts and techniques is urgently desired. Following this motivation, we\npresent a comprehensive and up-to-date survey on GraphRAG. Our survey first\nproposes a holistic GraphRAG framework by defining its key components,\nincluding query processor, retriever, organizer, generator, and data source.\nFurthermore, recognizing that graphs in different domains exhibit distinct\nrelational patterns and require dedicated designs, we review GraphRAG\ntechniques uniquely tailored to each domain. Finally, we discuss research\nchallenges and brainstorm directions to inspire cross-disciplinary\nopportunities. Our survey repository is publicly maintained at\nhttps://github.com/Graph-RAG/GraphRAG/.\n","authors":["Haoyu Han","Yu Wang","Harry Shomer","Kai Guo","Jiayuan Ding","Yongjia Lei","Mahantesh Halappanavar","Ryan A. Rossi","Subhabrata Mukherjee","Xianfeng Tang","Qi He","Zhigang Hua","Bo Long","Tong Zhao","Neil Shah","Amin Javari","Yinglong Xia","Jiliang Tang"],"pdf_url":"https://arxiv.org/pdf/2501.00309v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09852v2","updated":"2025-01-08T01:44:07Z","published":"2024-11-15T00:20:36Z","title":"InterFormer: Towards Effective Heterogeneous Interaction Learning for\n Click-Through Rate Prediction","summary":" Click-through rate (CTR) prediction, which predicts the probability of a user\nclicking an ad, is a fundamental task in recommender systems. 
The emergence of\nheterogeneous information, such as user profile and behavior sequences, depicts\nuser interests from different aspects. A mutually beneficial integration of\nheterogeneous information is the cornerstone towards the success of CTR\nprediction. However, most of the existing methods suffer from two fundamental\nlimitations, including (1) insufficient inter-mode interaction due to the\nunidirectional information flow between modes, and (2) aggressive information\naggregation caused by early summarization, resulting in excessive information\nloss. To address the above limitations, we propose a novel module named\nInterFormer to learn heterogeneous information interaction in an interleaving\nstyle. To achieve better interaction learning, InterFormer enables\nbidirectional information flow for mutually beneficial learning across\ndifferent modes. To avoid aggressive information aggregation, we retain\ncomplete information in each data mode and use a separate bridging arch for\neffective information selection and summarization. Our proposed InterFormer\nachieves state-of-the-art performance on three public datasets and a\nlarge-scale industrial dataset.\n","authors":["Zhichen Zeng","Xiaolong Liu","Mengyue Hang","Xiaoyi Liu","Qinghai Zhou","Chaofei Yang","Yiqun Liu","Yichen Ruan","Laming Chen","Yuxin Chen","Yujia Hao","Jiaqi Xu","Jade Nie","Xi Liu","Buyun Zhang","Wei Wen","Siyang Yuan","Kai Wang","Wen-Yen Chen","Yiping Han","Huayu Li","Chunzhi Yang","Bo Long","Philip S. 
Yu","Hanghang Tong","Jiyan Yang"],"pdf_url":"https://arxiv.org/pdf/2411.09852v2.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2408.06653v3","updated":"2025-01-08T20:40:09Z","published":"2024-08-13T05:53:46Z","title":"Hierarchical Structured Neural Network: Efficient Retrieval Scaling for\n Large Scale Recommendation","summary":" Retrieval, the initial stage of a recommendation system, is tasked with\ndown-selecting items from a pool of tens of millions of candidates to a few\nthousands. Embedding Based Retrieval (EBR) has been a typical choice for this\nproblem, addressing the computational demands of deep neural networks across\nvast item corpora. EBR utilizes Two Tower or Siamese Networks to learn\nrepresentations for users and items, and employ Approximate Nearest Neighbor\n(ANN) search to efficiently retrieve relevant items. Despite its popularity in\nindustry, EBR faces limitations. The Two Tower architecture, relying on a\nsingle dot product interaction, struggles to capture complex data distributions\ndue to limited capability in learning expressive interactions between users and\nitems. Additionally, ANN index building and representation learning for user\nand item are often separate, leading to inconsistencies exacerbated by\nrepresentation (e.g. continuous online training) and item drift (e.g. items\nexpired and new items added). In this paper, we introduce the Hierarchical\nStructured Neural Network (HSNN), an efficient deep neural network model to\nlearn intricate user and item interactions beyond the commonly used dot product\nin retrieval tasks, achieving sublinear computational costs relative to corpus\nsize. A Modular Neural Network (MoNN) is designed to maintain high\nexpressiveness for interaction learning while ensuring efficiency. A mixture of\nMoNNs operate on a hierarchical item index to achieve extensive computation\nsharing, enabling it to scale up to large corpus size. 
MoNN and the\nhierarchical index are jointly learnt to continuously adapt to distribution\nshifts in both user interests and item distributions. HSNN achieves substantial\nimprovement in offline evaluation compared to prevailing methods.\n","authors":["Kaushik Rangadurai","Siyang Yuan","Minhui Huang","Yiqun Liu","Golnaz Ghasemiesfeh","Yunchen Pu","Haiyu Lu","Xingfeng He","Fangzhou Xu","Andrew Cui","Vidhoon Viswanathan","Lin Yang","Liang Wang","Jiyan Yang","Chonglin Sun"],"pdf_url":"https://arxiv.org/pdf/2408.06653v3.pdf","comment":"Resubmit"},{"id":"http://arxiv.org/abs/2501.04802v1","updated":"2025-01-08T19:29:33Z","published":"2025-01-08T19:29:33Z","title":"Reproducing HotFlip for Corpus Poisoning Attacks in Dense Retrieval","summary":" HotFlip is a topical gradient-based word substitution method for attacking\nlanguage models. Recently, this method has been further applied to attack\nretrieval systems by generating malicious passages that are injected into a\ncorpus, i.e., corpus poisoning. However, HotFlip is known to be computationally\ninefficient, with the majority of time being spent on gradient accumulation for\neach query-passage pair during the adversarial token generation phase, making\nit impossible to generate an adequate number of adversarial passages in a\nreasonable amount of time. Moreover, the attack method itself assumes access to\na set of user queries, a strong assumption that does not correspond to how\nreal-world adversarial attacks are usually performed. In this paper, we first\nsignificantly boost the efficiency of HotFlip, reducing the adversarial\ngeneration process from 4 hours per document to only 15 minutes, using the same\nhardware. We further contribute experiments and analysis on two additional\ntasks: (1) transfer-based black-box attacks, and (2) query-agnostic attacks.\nWhenever possible, we provide comparisons between the original method and our\nimproved version. 
Our experiments demonstrate that HotFlip can effectively\nattack a variety of dense retrievers, with an observed trend that its attack\nperformance diminishes against more advanced and recent methods. Interestingly,\nwe observe that while HotFlip performs poorly in a black-box setting,\nindicating limited capacity for generalization, in query-agnostic scenarios its\nperformance is correlated to the volume of injected adversarial passages.\n","authors":["Yongkang Li","Panagiotis Eustratiadis","Evangelos Kanoulas"],"pdf_url":"https://arxiv.org/pdf/2501.04802v1.pdf","comment":"This paper has been accepted for oral presentation in the\n reproducibility track at ECIR 2025"},{"id":"http://arxiv.org/abs/2501.04762v1","updated":"2025-01-08T18:08:48Z","published":"2025-01-08T18:08:48Z","title":"Efficient and Responsible Adaptation of Large Language Models for Robust\n and Equitable Top-k Recommendations","summary":" Conventional recommendation systems (RSs) are typically optimized to enhance\nperformance metrics uniformly across all training samples, inadvertently\noverlooking the needs of diverse user populations. The performance disparity\namong various populations can harm the model's robustness to sub-populations\ndue to the varying user properties. While large language models (LLMs) show\npromise in enhancing RS performance, their practical applicability is hindered\nby high costs, inference latency, and degraded performance on long user\nqueries. To address these challenges, we propose a hybrid task allocation\nframework designed to promote social good by equitably serving all user groups.\nBy adopting a two-phase approach, we promote a strategic assignment of tasks\nfor efficient and responsible adaptation of LLMs. Our strategy works by first\nidentifying the weak and inactive users that receive a suboptimal ranking\nperformance by RSs. 
Next, we use an in-context learning approach for such\nusers, wherein each user interaction history is contextualized as a distinct\nranking task. We evaluate our hybrid framework by incorporating eight different\nrecommendation algorithms and three different LLMs -- both open and\nclose-sourced. Our results on three real-world datasets show a significant\nreduction in weak users and improved robustness to subpopulations without\ndisproportionately escalating costs.\n","authors":["Kirandeep Kaur","Manya Chadha","Vinayak Gupta","Chirag Shah"],"pdf_url":"https://arxiv.org/pdf/2501.04762v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2405.00824"},{"id":"http://arxiv.org/abs/2501.04763v1","updated":"2025-01-08T18:18:03Z","published":"2025-01-08T18:18:03Z","title":"Search engines in polarized media environment: Auditing political\n information curation on Google and Bing prior to 2024 US elections","summary":" Search engines play an important role in the context of modern elections. By\ncurating information in response to user queries, search engines influence how\nindividuals are informed about election-related developments and perceive the\nmedia environment in which elections take place. It has particular implications\nfor (perceived) polarization, especially if search engines' curation results in\na skewed treatment of information sources based on their political leaning.\nUntil now, however, it is unclear whether such a partisan gap emerges through\ninformation curation on search engines and what user- and system-side factors\naffect it. To address this shortcoming, we audit the two largest Western search\nengines, Google and Bing, prior to the 2024 US presidential elections and\nexamine how these search engines' organic search results and additional\ninterface elements represent election-related information depending on the\nqueries' slant, user location, and time when the search was conducted. 
Our\nfindings indicate that both search engines tend to prioritize left-leaning\nmedia sources, with the exact scope of search results' ideological slant\nvarying between Democrat- and Republican-focused queries. We also observe\nlimited effects of location- and time-based factors on organic search results,\nwhereas results for additional interface elements were more volatile over time\nand specific US states. Together, our observations highlight that search\nengines' information curation actively mirrors the partisan divides present in\nthe US media environments and has the potential to contribute to (perceived)\npolarization within these environments.\n","authors":["Mykola Makhortykh","Tobias Rorhbach","Maryna Sydorova","Elizaveta Kuznetsova"],"pdf_url":"https://arxiv.org/pdf/2501.04763v1.pdf","comment":"38 pages"},{"id":"http://arxiv.org/abs/2501.05485v1","updated":"2025-01-08T09:06:29Z","published":"2025-01-08T09:06:29Z","title":"S2 Chunking: A Hybrid Framework for Document Segmentation Through\n Integrated Spatial and Semantic Analysis","summary":" Document chunking is a critical task in natural language processing (NLP)\nthat involves dividing a document into meaningful segments. Traditional methods\noften rely solely on semantic analysis, ignoring the spatial layout of\nelements, which is crucial for understanding relationships in complex\ndocuments. This paper introduces a novel hybrid approach that combines layout\nstructure, semantic analysis, and spatial relationships to enhance the cohesion\nand accuracy of document chunks. By leveraging bounding box information (bbox)\nand text embeddings, our method constructs a weighted graph representation of\ndocument elements, which is then clustered using spectral clustering.\nExperimental results demonstrate that this approach outperforms traditional\nmethods, particularly in documents with diverse layouts such as reports,\narticles, and multi-column designs. 
The proposed method also ensures that no\nchunk exceeds a specified token length, making it suitable for use cases where\ntoken limits are critical (e.g., language models with input size limitations)\n","authors":["Prashant Verma"],"pdf_url":"https://arxiv.org/pdf/2501.05485v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.10415v1","updated":"2025-01-08T14:17:26Z","published":"2025-01-08T14:17:26Z","title":"Making Software FAIR: A machine-assisted workflow for the research\n software lifecycle","summary":" A key issue hindering discoverability, attribution and reusability of open\nresearch software is that its existence often remains hidden within the\nmanuscript of research papers. For these resources to become first-class\nbibliographic records, they first need to be identified and subsequently\nregistered with persistent identifiers (PIDs) to be made FAIR (Findable,\nAccessible, Interoperable and Reusable). To this day, much open research\nsoftware fails to meet FAIR principles and software resources are mostly not\nexplicitly linked from the manuscripts that introduced them or used them.\nSoFAIR is a 2-year international project (2024-2025) which proposes a solution\nto the above problem realised over the content available through the global\nnetwork of open repositories. 
SoFAIR will extend the capabilities of widely\nused open scholarly infrastructures (CORE, Software Heritage, HAL) and tools\n(GROBID) operated by the consortium partners, delivering and deploying an\neffective solution for the management of the research software lifecycle,\nincluding: 1) ML-assisted identification of research software assets from\nwithin the manuscripts of scholarly papers, 2) validation of the identified\nassets by authors, 3) registration of software assets with PIDs and their\narchival.\n","authors":["Petr Knoth","Laurent Romary","Patrice Lopez","Roberto Di Cosmo","Pavel Smrz","Tomasz Umerle","Melissa Harrison","Alain Monteil","Matteo Cancellieri","David Pride"],"pdf_url":"https://arxiv.org/pdf/2501.10415v1.pdf","comment":"5 pages"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2409.08272v2","updated":"2025-01-08T18:59:48Z","published":"2024-09-12T17:59:04Z","title":"Click2Mask: Local Editing with Dynamic Mask Generation","summary":" Recent advancements in generative models have revolutionized image generation\nand editing, making these tasks accessible to non-experts. This paper focuses\non local image editing, particularly the task of adding new content to a\nloosely specified area. Existing methods often require a precise mask or a\ndetailed description of the location, which can be cumbersome and prone to\nerrors. We propose Click2Mask, a novel approach that simplifies the local\nediting process by requiring only a single point of reference (in addition to\nthe content description). A mask is dynamically grown around this point during\na Blended Latent Diffusion (BLD) process, guided by a masked CLIP-based\nsemantic loss. Click2Mask surpasses the limitations of segmentation-based and\nfine-tuning dependent methods, offering a more user-friendly and contextually\naccurate solution. 
Our experiments demonstrate that Click2Mask not only\nminimizes user effort but also enables competitive or superior local image\nmanipulations compared to SoTA methods, according to both human judgement and\nautomatic metrics. Key contributions include the simplification of user input,\nthe ability to freely add objects unconstrained by existing segments, and the\nintegration potential of our dynamic mask approach within other editing\nmethods.\n","authors":["Omer Regev","Omri Avrahami","Dani Lischinski"],"pdf_url":"https://arxiv.org/pdf/2409.08272v2.pdf","comment":"Accepted to AAAI 2025. Project page is available at\n https://omeregev.github.io/click2mask/"},{"id":"http://arxiv.org/abs/2309.10775v2","updated":"2025-01-08T18:59:39Z","published":"2023-09-19T17:21:12Z","title":"$O(k)$-Equivariant Dimensionality Reduction on Stiefel Manifolds","summary":" Many real-world datasets live on high-dimensional Stiefel and Grassmannian\nmanifolds, $V_k(\\mathbb{R}^N)$ and $Gr(k, \\mathbb{R}^N)$ respectively, and\nbenefit from projection onto lower-dimensional Stiefel and Grassmannian\nmanifolds. In this work, we propose an algorithm called \\textit{Principal\nStiefel Coordinates (PSC)} to reduce data dimensionality from $\nV_k(\\mathbb{R}^N)$ to $V_k(\\mathbb{R}^n)$ in an \\textit{$O(k)$-equivariant}\nmanner ($k \\leq n \\ll N$). We begin by observing that each element $\\alpha \\in\nV_n(\\mathbb{R}^N)$ defines an isometric embedding of $V_k(\\mathbb{R}^n)$ into\n$V_k(\\mathbb{R}^N)$. Next, we describe two ways of finding a suitable embedding\nmap $\\alpha$: one via an extension of principal component analysis\n($\\alpha_{PCA}$), and one that further minimizes data fit error using gradient\ndescent ($\\alpha_{GD}$). 
Then, we define a continuous and $O(k)$-equivariant\nmap $\\pi_\\alpha$ that acts as a \"closest point operator\" to project the data\nonto the image of $V_k(\\mathbb{R}^n)$ in $V_k(\\mathbb{R}^N)$ under the\nembedding determined by $\\alpha$, while minimizing distortion. Because this\ndimensionality reduction is $O(k)$-equivariant, these results extend to\nGrassmannian manifolds as well. Lastly, we show that $\\pi_{\\alpha_{PCA}}$\nglobally minimizes projection error in a noiseless setting, while\n$\\pi_{\\alpha_{GD}}$ achieves a meaningfully different and improved outcome when\nthe data does not lie exactly on the image of a linearly embedded\nlower-dimensional Stiefel manifold as above. Multiple numerical experiments\nusing synthetic and real-world data are performed.\n","authors":["Andrew Lee","Harlin Lee","Jose A. Perea","Nikolas Schonsheck","Madeleine Weinstein"],"pdf_url":"https://arxiv.org/pdf/2309.10775v2.pdf","comment":"26 pages, 8 figures, comments welcome!"},{"id":"http://arxiv.org/abs/2501.04700v1","updated":"2025-01-08T18:59:36Z","published":"2025-01-08T18:59:36Z","title":"Planarian Neural Networks: Evolutionary Patterns from Basic Bilateria\n Shaping Modern Artificial Neural Network Architectures","summary":" This study examined the viability of enhancing the prediction accuracy of\nartificial neural networks (ANNs) in image classification tasks by developing\nANNs with evolution patterns similar to those of biological neural networks.\nResNet is a widely used family of neural networks with both deep and wide\nvariants; therefore, it was selected as the base model for our investigation.\nThe aim of this study is to improve the image classification performance of\nANNs via a novel approach inspired by the biological nervous system\narchitecture of planarians, which comprises a brain and two nerve cords. We\nbelieve that the unique neural architecture of planarians offers valuable\ninsights into the performance enhancement of ANNs. 
The proposed planarian\nneural architecture-based neural network was evaluated on the CIFAR-10 and\nCIFAR-100 datasets. Our results indicate that the proposed method exhibits\nhigher prediction accuracy than the baseline neural network models in image\nclassification tasks. These findings demonstrate the significant potential of\nbiologically inspired neural network architectures in improving the performance\nof ANNs in a wide range of applications.\n","authors":["Ziyuan Huang","Mark Newman","Maria Vaida","Srikar Bellur","Roozbeh Sadeghian","Andrew Siu","Hui Wang","Kevin Huggins"],"pdf_url":"https://arxiv.org/pdf/2501.04700v1.pdf","comment":"11 pages, 9 figures"},{"id":"http://arxiv.org/abs/2501.04697v1","updated":"2025-01-08T18:58:48Z","published":"2025-01-08T18:58:48Z","title":"Grokking at the Edge of Numerical Stability","summary":" Grokking, the sudden generalization that occurs after prolonged overfitting,\nis a surprising phenomenon challenging our understanding of deep learning.\nAlthough significant progress has been made in understanding grokking, the\nreasons behind the delayed generalization and its dependence on regularization\nremain unclear. In this work, we argue that without regularization, grokking\ntasks push models to the edge of numerical stability, introducing floating\npoint errors in the Softmax function, which we refer to as Softmax Collapse\n(SC). We demonstrate that SC prevents grokking and that mitigating SC enables\ngrokking without regularization. Investigating the root cause of SC, we find\nthat beyond the point of overfitting, the gradients strongly align with what we\ncall the na\\\"ive loss minimization (NLM) direction. This component of the\ngradient does not alter the model's predictions but decreases the loss by\nscaling the logits, typically by scaling the weights along their current\ndirection. 
We show that this scaling of the logits explains the delay in\ngeneralization characteristic of grokking and eventually leads to SC, halting\nfurther learning. To validate our hypotheses, we introduce two key\ncontributions that address the challenges in grokking tasks: StableMax, a new\nactivation function that prevents SC and enables grokking without\nregularization, and $\\perp$Grad, a training algorithm that promotes quick\ngeneralization in grokking tasks by preventing NLM altogether. These\ncontributions provide new insights into grokking, elucidating its delayed\ngeneralization, reliance on regularization, and the effectiveness of existing\ngrokking-inducing methods. Code for this paper is available at\nhttps://github.com/LucasPrietoAl/grokking-at-the-edge-of-numerical-stability.\n","authors":["Lucas Prieto","Melih Barsbey","Pedro A. M. Mediano","Tolga Birdal"],"pdf_url":"https://arxiv.org/pdf/2501.04697v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04695v1","updated":"2025-01-08T18:58:22Z","published":"2025-01-08T18:58:22Z","title":"Re-ranking the Context for Multimodal Retrieval Augmented Generation","summary":" Retrieval-augmented generation (RAG) enhances large language models (LLMs) by\nincorporating external knowledge to generate a response within a context with\nimproved accuracy and reduced hallucinations. However, multi-modal RAG systems\nface unique challenges: (i) the retrieval process may select irrelevant entries\nto user query (e.g., images, documents), and (ii) vision-language models or\nmulti-modal language models like GPT-4o may hallucinate when processing these\nentries to generate RAG output. In this paper, we aim to address the first\nchallenge, i.e, improving the selection of relevant context from the\nknowledge-base in retrieval phase of the multi-modal RAG. 
Specifically, we\nleverage the relevancy score (RS) measure designed in our previous work for\nevaluating RAG performance to select more relevant entries in the retrieval\nprocess. Retrieval based on embeddings (e.g., CLIP-based embeddings) and cosine\nsimilarity usually performs poorly, particularly for multi-modal data. We show\nthat by using a more advanced relevancy measure, one can enhance the retrieval\nprocess by selecting more relevant pieces from the knowledge-base and\neliminating the irrelevant pieces from the context by adaptively selecting up\nto $k$ entries instead of a fixed number of entries. Our evaluation using the\nCOCO dataset demonstrates significant enhancement in selecting relevant context\nand accuracy of the generated response.\n","authors":["Matin Mortaheb","Mohammad A. Amir Khojastepour","Srimat T. Chakradhar","Sennur Ulukus"],"pdf_url":"https://arxiv.org/pdf/2501.04695v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04690v1","updated":"2025-01-08T18:53:50Z","published":"2025-01-08T18:53:50Z","title":"Comparative Analysis of Quantum and Classical Support Vector Classifiers\n for Software Bug Prediction: An Exploratory Study","summary":" Purpose: Quantum computing promises to transform problem-solving across\nvarious domains with rapid and practical solutions. Within Software Evolution\nand Maintenance, Quantum Machine Learning (QML) remains a mostly underexplored\ndomain, particularly in addressing challenges such as detecting buggy software\ncommits from code repositories. Methods: In this study, we investigate the\npractical application of Quantum Support Vector Classifiers (QSVC) for\ndetecting buggy software commits across 14 open-source software projects with\ndiverse dataset sizes encompassing 30,924 data instances. We compare the QML\nalgorithms PQSVC (Pegasos QSVC) and QSVC against the classical Support Vector\nClassifier (SVC). Our technique addresses large datasets in QSVC algorithms by\ndividing them into smaller subsets. 
We propose and evaluate an aggregation\nmethod to combine predictions from these models to detect the entire test\ndataset. We also introduce an incremental testing methodology to overcome the\ndifficulties of quantum feature mapping during the testing approach. Results:\nThe study shows the effectiveness of QSVC and PQSVC in detecting buggy software\ncommits. The aggregation technique successfully combines predictions from\nsmaller data subsets, enhancing the overall detection accuracy for the entire\ntest dataset. The incremental testing methodology effectively manages the\nchallenges associated with quantum feature mapping during the testing process.\nConclusion: We contribute to the advancement of QML algorithms in defect\nprediction, unveiling the potential for further research in this domain. The\nspecific scenario of the Short-Term Activity Frame (STAF) highlights the early\ndetection of buggy software commits during the initial developmental phases of\nsoftware systems, particularly when dataset sizes remain insufficient to train\nmachine learning models.\n","authors":["Md Nadim","Mohammad Hassan","Ashis Kumar Mandal","Chanchal K. Roy","Banani Roy","Kevin A. Schneider"],"pdf_url":"https://arxiv.org/pdf/2501.04690v1.pdf","comment":"Accepted for publication in the Springer Journal: Quantum Machine\n Intelligence (https://link.springer.com/journal/42484)"},{"id":"http://arxiv.org/abs/2501.04686v1","updated":"2025-01-08T18:49:41Z","published":"2025-01-08T18:49:41Z","title":"URSA: Understanding and Verifying Chain-of-thought Reasoning in\n Multimodal Mathematics","summary":" Chain-of-thought (CoT) reasoning has been widely applied in the mathematical\nreasoning of Large Language Models (LLMs). Recently, the introduction of\nderivative process supervision on CoT trajectories has sparked discussions on\nenhancing scaling capabilities during test time, thereby boosting the potential\nof these models. 
However, in multimodal mathematical reasoning, the scarcity of\nhigh-quality CoT training data has hindered existing models from achieving\nhigh-precision CoT reasoning and has limited the realization of reasoning\npotential during test time. In this work, we propose a three-module synthesis\nstrategy that integrates CoT distillation, trajectory-format rewriting, and\nformat unification. It results in a high-quality CoT reasoning instruction\nfine-tuning dataset in multimodal mathematics, MMathCoT-1M. We comprehensively\nvalidate the state-of-the-art (SOTA) performance of the trained URSA-7B model\non multiple multimodal mathematical benchmarks. For test-time scaling, we\nintroduce a data synthesis strategy that automatically generates process\nannotation datasets, known as DualMath-1.1M, focusing on both interpretation\nand logic. By further training URSA-7B on DualMath-1.1M, we transition from CoT\nreasoning capabilities to robust supervision abilities. The trained URSA-RM-7B\nacts as a verifier, effectively enhancing the performance of URSA-7B at test\ntime. URSA-RM-7B also demonstrates excellent out-of-distribution (OOD)\nverifying capabilities, showcasing its generalization. Model weights, training\ndata and code will be open-sourced.\n","authors":["Ruilin Luo","Zhuofan Zheng","Yifan Wang","Yiyao Yu","Xinzhe Ni","Zicheng Lin","Jin Zeng","Yujiu Yang"],"pdf_url":"https://arxiv.org/pdf/2501.04686v1.pdf","comment":"27 pages, 10 tables, 17 figures. The training data has been released.\n The code and model are currently undergoing internal review. They will be\n made available soon. 
Project url: https://ursa-math.github.io"},{"id":"http://arxiv.org/abs/2501.04683v1","updated":"2025-01-08T18:43:59Z","published":"2025-01-08T18:43:59Z","title":"Toward Sufficient Statistical Power in Algorithmic Bias Assessment: A\n Test for ABROCA","summary":" Algorithmic bias is a pressing concern in educational data mining (EDM), as\nit risks amplifying inequities in learning outcomes. The Area Between ROC\nCurves (ABROCA) metric is frequently used to measure discrepancies in model\nperformance across demographic groups to quantify overall model fairness.\nHowever, its skewed distribution--especially when class or group imbalances\nexist--makes significance testing challenging. This study investigates ABROCA's\ndistributional properties and contributes robust methods for its significance\ntesting. Specifically, we address (1) whether ABROCA follows any known\ndistribution, (2) how to reliably test for algorithmic bias using ABROCA, and\n(3) the statistical power achievable with ABROCA-based bias assessments under\ntypical EDM sample specifications. Simulation results confirm that ABROCA does\nnot match standard distributions, including those suited to accommodate\nskewness. We propose nonparametric randomization tests for ABROCA and\ndemonstrate that reliably detecting bias with ABROCA requires large sample\nsizes or substantial effect sizes, particularly in imbalanced settings.\nFindings suggest that ABROCA-based bias evaluation based on sample sizes common\nin EDM tends to be underpowered, undermining the reliability of conclusions\nabout model fairness. By offering open-source code to simulate power and\nstatistically test ABROCA, this paper aims to foster more reliable statistical\ntesting in EDM research. 
It supports broader efforts toward replicability and\nequity in educational modeling.\n","authors":["Conrad Borchers"],"pdf_url":"https://arxiv.org/pdf/2501.04683v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04675v1","updated":"2025-01-08T18:33:17Z","published":"2025-01-08T18:33:17Z","title":"Enhancing Financial VQA in Vision Language Models using Intermediate\n Structured Representations","summary":" Chart interpretation is crucial for visual data analysis, but accurately\nextracting information from charts poses significant challenges for automated\nmodels. This study investigates the fine-tuning of DEPLOT, a modality\nconversion module that translates the image of a plot or chart to a linearized\ntable, on a custom dataset of 50,000 bar charts. The dataset comprises simple,\nstacked, and grouped bar charts, targeting the unique structural features of\nthese visualizations. The finetuned DEPLOT model is evaluated against its base\nversion using a test set of 1,000 images and two metrics: Relative Mapping\nSimilarity (RMS), which measures categorical mapping accuracy, and Relative\nNumber Set Similarity (RNSS), which evaluates numerical interpretation\naccuracy. To further explore the reasoning capabilities of large language\nmodels (LLMs), we curate an additional set of 100 bar chart images paired with\nquestion answer sets. 
Our findings demonstrate that providing a structured\nintermediate table alongside the image significantly enhances LLM reasoning\nperformance compared to direct image queries.\n","authors":["Archita Srivastava","Abhas Kumar","Rajesh Kumar","Prabhakar Srinivasan"],"pdf_url":"https://arxiv.org/pdf/2501.04675v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.02788v2","updated":"2025-01-08T18:33:07Z","published":"2025-01-06T06:07:40Z","title":"GLoG-CSUnet: Enhancing Vision Transformers with Adaptable Radiomic\n Features for Medical Image Segmentation","summary":" Vision Transformers (ViTs) have shown promise in medical image semantic\nsegmentation (MISS) by capturing long-range correlations. However, ViTs often\nstruggle to model local spatial information effectively, which is essential for\naccurately segmenting fine anatomical details, particularly when applied to\nsmall datasets without extensive pre-training. We introduce Gabor and Laplacian\nof Gaussian Convolutional Swin Network (GLoG-CSUnet), a novel architecture\nenhancing Transformer-based models by incorporating learnable radiomic\nfeatures. This approach integrates dynamically adaptive Gabor and Laplacian of\nGaussian (LoG) filters to capture texture, edge, and boundary information,\nenhancing the feature representation processed by the Transformer model. Our\nmethod uniquely combines the long-range dependency modeling of Transformers\nwith the texture analysis capabilities of Gabor and LoG features. 
Evaluated on\nthe Synapse multi-organ and ACDC cardiac segmentation datasets, GLoG-CSUnet\ndemonstrates significant improvements over state-of-the-art models, achieving a\n1.14% increase in Dice score for Synapse and 0.99% for ACDC, with minimal\ncomputational overhead (only 15 and 30 additional parameters, respectively).\nGLoG-CSUnet's flexible design allows integration with various base models,\noffering a promising approach for incorporating radiomics-inspired feature\nextraction in Transformer architectures for medical image analysis. The code\nimplementation is available on GitHub at: https://github.com/HAAIL/GLoG-CSUnet.\n","authors":["Niloufar Eghbali","Hassan Bagher-Ebadian","Tuka Alhanai","Mohammad M. Ghassemi"],"pdf_url":"https://arxiv.org/pdf/2501.02788v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04667v1","updated":"2025-01-08T18:28:12Z","published":"2025-01-08T18:28:12Z","title":"Natural Variational Annealing for Multimodal Optimization","summary":" We introduce a new multimodal optimization approach called Natural\nVariational Annealing (NVA) that combines the strengths of three foundational\nconcepts to simultaneously search for multiple global and local modes of\nblack-box nonconvex objectives. First, it implements a simultaneous search by\nusing variational posteriors, such as, mixtures of Gaussians. Second, it\napplies annealing to gradually trade off exploration for exploitation. Finally,\nit learns the variational search distribution using natural-gradient learning\nwhere updates resemble well-known and easy-to-implement algorithms. The three\nconcepts come together in NVA giving rise to new algorithms and also allowing\nus to incorporate \"fitness shaping\", a core concept from evolutionary\nalgorithms. We assess the quality of search on simulations and compare them to\nmethods using gradient descent and evolution strategies. 
We also provide an\napplication to a real-world inverse problem in planetary science.\n","authors":["Tâm Le Minh","Julyan Arbel","Thomas Möllenhoff","Mohammad Emtiyaz Khan","Florence Forbes"],"pdf_url":"https://arxiv.org/pdf/2501.04667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04752v2","updated":"2025-01-08T18:23:58Z","published":"2024-12-06T03:33:31Z","title":"GABAR: Graph Attention-Based Action Ranking for Relational Policy\n Learning","summary":" We propose a novel approach to learn relational policies for classical\nplanning based on learning to rank actions. We introduce a new graph\nrepresentation that explicitly captures action information and propose a Graph\nNeural Network architecture augmented with Gated Recurrent Units (GRUs) to\nlearn action rankings. Our model is trained on small problem instances and\ngeneralizes to significantly larger instances where traditional planning\nbecomes computationally expensive. Experimental results across standard\nplanning benchmarks demonstrate that our action-ranking approach achieves\ngeneralization to significantly larger problems than those used in training.\n","authors":["Rajesh Mangannavar","Stefan Lee","Alan Fern","Prasad Tadepalli"],"pdf_url":"https://arxiv.org/pdf/2412.04752v2.pdf","comment":"6 Pages, 1 figure. Updated acknowledgments"},{"id":"http://arxiv.org/abs/2412.01348v2","updated":"2025-01-08T18:20:46Z","published":"2024-12-02T10:19:36Z","title":"Hierarchical Object-Oriented POMDP Planning for Object Rearrangement","summary":" We present an online planning framework for solving multi-object\nrearrangement problems in partially observable, multi-room environments.\nCurrent object rearrangement solutions, primarily based on Reinforcement\nLearning or hand-coded planning methods, often lack adaptability to diverse\nchallenges. To address this limitation, we introduce a novel Hierarchical\nObject-Oriented Partially Observed Markov Decision Process (HOO-POMDP) planning\napproach. 
This approach comprises (a) an object-oriented POMDP planner\ngenerating sub-goals, (b) a set of low-level policies for sub-goal achievement,\nand (c) an abstraction system converting the continuous low-level world into a\nrepresentation suitable for abstract planning. We evaluate our system on\nvarying numbers of objects, rooms, and problem types in AI2-THOR simulated\nenvironments with promising results.\n","authors":["Rajesh Mangannavar","Alan Fern","Prasad Tadepalli"],"pdf_url":"https://arxiv.org/pdf/2412.01348v2.pdf","comment":"17 pages, 2 Figures. Preprint. Updated acknowledgments"},{"id":"http://arxiv.org/abs/2407.03289v2","updated":"2025-01-08T18:20:18Z","published":"2024-07-03T17:22:33Z","title":"Correlated Privacy Mechanisms for Differentially Private Distributed\n Mean Estimation","summary":" Differentially private distributed mean estimation (DP-DME) is a fundamental\nbuilding block in privacy-preserving federated learning, where a central server\nestimates the mean of $d$-dimensional vectors held by $n$ users while ensuring\n$(\\epsilon,\\delta)$-DP. Local differential privacy (LDP) and distributed DP\nwith secure aggregation (SA) are the most common notions of DP used in DP-DME\nsettings with an untrusted server. LDP provides strong resilience to dropouts,\ncolluding users, and adversarial attacks, but suffers from poor utility. In\ncontrast, SA-based DP-DME achieves an $O(n)$ utility gain over LDP in DME, but\nrequires increased communication and computation overheads and complex\nmulti-round protocols to handle dropouts and attacks. In this work, we present\na generalized framework for DP-DME that captures LDP and SA-based mechanisms\nas extreme cases. Our framework provides a foundation for developing and\nanalyzing a variety of DP-DME protocols that leverage correlated privacy\nmechanisms across users. 
To this end, we propose CorDP-DME, a novel DP-DME\nmechanism based on the correlated Gaussian mechanism, that spans the gap\nbetween DME with LDP and distributed DP. We prove that CorDP-DME offers a\nfavorable balance between utility and resilience to dropout and collusion. We\nprovide an information-theoretic analysis of CorDP-DME, and derive theoretical\nguarantees for utility under any given privacy parameters and dropout/colluding\nuser thresholds. Our results demonstrate that (anti) correlated Gaussian DP\nmechanisms can significantly improve utility in mean estimation tasks compared\nto LDP -- even in adversarial settings -- while maintaining better resilience\nto dropouts and attacks compared to distributed DP.\n","authors":["Sajani Vithana","Viveck R. Cadambe","Flavio P. Calmon","Haewon Jeong"],"pdf_url":"https://arxiv.org/pdf/2407.03289v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17194v3","updated":"2025-01-08T18:18:51Z","published":"2024-10-22T17:13:34Z","title":"Representation Shattering in Transformers: A Synthetic Study with\n Knowledge Editing","summary":" Knowledge Editing (KE) algorithms alter models' weights to perform targeted\nupdates to incorrect, outdated, or otherwise unwanted factual associations. To\nbetter identify the possibilities and limitations of these approaches, recent\nwork has shown that applying KE can adversely affect models' factual recall\naccuracy and diminish their general reasoning abilities. While these studies\ngive broad insights into the potential harms of KE algorithms, e.g., via\nperformance evaluations on benchmarks, we argue little is understood as to why\nsuch destructive failures occur. Is it possible KE methods distort\nrepresentations of concepts beyond the targeted fact, hence hampering abilities\nat broad? If so, what is the extent of this distortion? 
Motivated by such\nquestions, we define a novel synthetic task wherein a Transformer is trained\nfrom scratch to internalize a \"structured\" knowledge graph. The structure\nenforces relationships between entities of the graph, such that editing a\nfactual association has \"trickling effects\" on other entities in the graph\n(e.g., altering X's parent is Y to Z affects who X's siblings' parent is).\nThrough evaluations of edited models and analysis of extracted representations,\nwe show that KE inadvertently affects representations of entities beyond the\ntargeted one, distorting relevant structures that allow a model to infer unseen\nknowledge about an entity. We call this phenomenon representation shattering\nand demonstrate that it results in degradation of factual recall and reasoning\nperformance more broadly. To corroborate our findings in a more naturalistic\nsetup, we perform preliminary experiments with pre-trained Llama and Mamba\nmodels, reproducing the representation shattering effect therein as well.\nOverall, our work yields a precise mechanistic hypothesis to explain why KE has\nadverse effects on model abilities.\n","authors":["Kento Nishi","Maya Okawa","Rahul Ramesh","Mikail Khona","Hidenori Tanaka","Ekdeep Singh Lubana"],"pdf_url":"https://arxiv.org/pdf/2410.17194v3.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2501.04652v1","updated":"2025-01-08T18:05:30Z","published":"2025-01-08T18:05:30Z","title":"Multi-task retriever fine-tuning for domain-specific and efficient RAG","summary":" Retrieval-Augmented Generation (RAG) has become ubiquitous when deploying\nLarge Language Models (LLMs), as it can address typical limitations such as\ngenerating hallucinated or outdated information. However, when building\nreal-world RAG applications, practical issues arise. First, the retrieved\ninformation is generally domain-specific. 
Since it is computationally expensive\nto fine-tune LLMs, it is more feasible to fine-tune the retriever to improve\nthe quality of the data included in the LLM input. Second, as more applications\nare deployed in the same real-world system, one cannot afford to deploy\nseparate retrievers. Moreover, these RAG applications normally retrieve\ndifferent kinds of data. Our solution is to instruction fine-tune a small\nretriever encoder on a variety of domain-specific tasks to allow us to deploy\none encoder that can serve many use cases, thereby achieving low-cost,\nscalability, and speed. We show how this encoder generalizes to out-of-domain\nsettings as well as to an unseen retrieval task on real-world enterprise use\ncases.\n","authors":["Patrice Béchard","Orlando Marquez Ayala"],"pdf_url":"https://arxiv.org/pdf/2501.04652v1.pdf","comment":"9 pages, 2 figures. Submitted to NAACL 2025 Industry Track"},{"id":"http://arxiv.org/abs/2501.04641v1","updated":"2025-01-08T17:47:06Z","published":"2025-01-08T17:47:06Z","title":"A Statistical Theory of Contrastive Pre-training and Multimodal\n Generative AI","summary":" Multi-modal generative AI systems, such as those combining vision and\nlanguage, rely on contrastive pre-training to learn representations across\ndifferent modalities. While their practical benefits are widely acknowledged, a\nrigorous theoretical understanding of the contrastive pre-training framework\nremains limited. This paper develops a theoretical framework to explain the\nsuccess of contrastive pre-training in downstream tasks, such as zero-shot\nclassification, conditional diffusion models, and vision-language models. We\nintroduce the concept of approximate sufficient statistics, a generalization of\nthe classical sufficient statistics, and show that near-minimizers of the\ncontrastive pre-training loss are approximately sufficient, making them\nadaptable to diverse downstream tasks. 
We further propose the Joint Generative\nHierarchical Model for the joint distribution of images and text, showing that\ntransformers can efficiently approximate relevant functions within this model\nvia belief propagation. Building on this framework, we derive sample complexity\nguarantees for multi-modal learning based on contrastive pre-trained\nrepresentations. Numerical simulations validate these theoretical findings,\ndemonstrating the strong generalization performance of contrastively\npre-trained transformers in various multi-modal tasks.\n","authors":["Kazusato Oko","Licong Lin","Yuhang Cai","Song Mei"],"pdf_url":"https://arxiv.org/pdf/2501.04641v1.pdf","comment":"108 pages"},{"id":"http://arxiv.org/abs/2409.05901v2","updated":"2025-01-08T17:36:41Z","published":"2024-09-05T20:45:44Z","title":"Diffusion Map Autoencoder","summary":" In this work, we explore various modifications to diffusion maps (DMAP),\nincluding their incorporation into a layered sequential neural network model\ntrained with gradient descent. The result is a sequential neural network that\ninherits the interpretability of diffusion maps.\n","authors":["Julio Candanedo"],"pdf_url":"https://arxiv.org/pdf/2409.05901v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13015v4","updated":"2025-01-08T17:14:40Z","published":"2023-11-21T21:44:28Z","title":"Fast and Interpretable Mortality Risk Scores for Critical Care Patients","summary":" Prediction of mortality in intensive care unit (ICU) patients typically\nrelies on black box models (that are unacceptable for use in hospitals) or\nhand-tuned interpretable models (that might lead to the loss in performance).\nWe aim to bridge the gap between these two categories by building on modern\ninterpretable ML techniques to design interpretable mortality risk scores that\nare as accurate as black boxes. 
We developed a new algorithm, GroupFasterRisk,\nwhich has several important benefits: it uses both hard and soft direct\nsparsity regularization, it incorporates group sparsity to allow more cohesive\nmodels, it allows for monotonicity constraint to include domain knowledge, and\nit produces many equally-good models, which allows domain experts to choose\namong them. For evaluation, we leveraged the largest existing public ICU\nmonitoring datasets (MIMIC III and eICU). Models produced by GroupFasterRisk\noutperformed OASIS and SAPS II scores and performed similarly to APACHE IV/IVa\nwhile using at most a third of the parameters. For patients with\nsepsis/septicemia, acute myocardial infarction, heart failure, and acute kidney\nfailure, GroupFasterRisk models outperformed OASIS and SOFA. Finally, different\nmortality prediction ML approaches performed better based on variables selected\nby GroupFasterRisk as compared to OASIS variables. GroupFasterRisk's models\nperformed better than risk scores currently used in hospitals, and on par with\nblack box ML models, while being orders of magnitude sparser. Because\nGroupFasterRisk produces a variety of risk scores, it allows design flexibility\n- the key enabler of practical model creation. 
GroupFasterRisk is a fast,\naccessible, and flexible procedure that allows learning a diverse set of sparse\nrisk scores for mortality prediction.\n","authors":["Chloe Qinyu Zhu","Muhang Tian","Lesia Semenova","Jiachang Liu","Jack Xu","Joseph Scarpa","Cynthia Rudin"],"pdf_url":"https://arxiv.org/pdf/2311.13015v4.pdf","comment":"This article has been accepted for publication in the Journal of the\n American Medical Informatics Association, published by Oxford University\n Press"},{"id":"http://arxiv.org/abs/2412.16780v2","updated":"2025-01-08T17:00:18Z","published":"2024-12-21T21:27:22Z","title":"Forget Vectors at Play: Universal Input Perturbations Driving Machine\n Unlearning in Image Classification","summary":" Machine unlearning (MU), which seeks to erase the influence of specific\nunwanted data from already-trained models, is becoming increasingly vital in\nmodel editing, particularly to comply with evolving data regulations like the\n``right to be forgotten''. Conventional approaches are predominantly\nmodel-based, typically requiring retraining or fine-tuning the model's weights\nto meet unlearning requirements. In this work, we approach the MU problem from\na novel input perturbation-based perspective, where the model weights remain\nintact throughout the unlearning process. We demonstrate the existence of a\nproactive input-based unlearning strategy, referred to forget vector, which can\nbe generated as an input-agnostic data perturbation and remains as effective as\nmodel-based approximate unlearning approaches. We also explore forget vector\narithmetic, whereby multiple class-specific forget vectors are combined through\nsimple operations (e.g., linear combinations) to generate new forget vectors\nfor unseen unlearning tasks, such as forgetting arbitrary subsets across\nclasses. Extensive experiments validate the effectiveness and adaptability of\nthe forget vector, showcasing its competitive performance relative to\nstate-of-the-art model-based methods. 
Codes are available at\nhttps://github.com/Changchangsun/Forget-Vector.\n","authors":["Changchang Sun","Ren Wang","Yihua Zhang","Jinghan Jia","Jiancheng Liu","Gaowen Liu","Sijia Liu","Yan Yan"],"pdf_url":"https://arxiv.org/pdf/2412.16780v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04882v1","updated":"2025-01-08T23:38:19Z","published":"2025-01-08T23:38:19Z","title":"Reach Measurement, Optimization and Frequency Capping In Targeted Online\n Advertising Under k-Anonymity","summary":" The growth in the use of online advertising to foster brand awareness over\nrecent years is largely attributable to the ubiquity of social media. One\npivotal technology contributing to the success of online brand advertising is\nfrequency capping, a mechanism that enables marketers to control the number of\ntimes an ad is shown to a specific user. However, the very foundation of this\ntechnology is being scrutinized as the industry gravitates towards advertising\nsolutions that prioritize user privacy. This paper delves into the issue of\nreach measurement and optimization within the context of $k$-anonymity, a\nprivacy-preserving model gaining traction across major online advertising\nplatforms. We outline how to report reach within this new privacy landscape and\ndemonstrate how probabilistic discounting, a probabilistic adaptation of\ntraditional frequency capping, can be employed to optimize campaign\nperformance. Experiments are performed to assess the trade-off between user\nprivacy and the efficacy of online brand advertising. 
Notably, we discern a\nsignificant dip in performance as soon as privacy is introduced, yet this comes\nwith a limited additional cost for advertising platforms to offer their users\nmore privacy.\n","authors":["Yuan Gao","Mu Qiao"],"pdf_url":"https://arxiv.org/pdf/2501.04882v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04881v1","updated":"2025-01-08T23:33:50Z","published":"2025-01-08T23:33:50Z","title":"Geophysical inverse problems with measurement-guided diffusion models","summary":" Solving inverse problems with the reverse process of a diffusion model\nrepresents an appealing avenue to produce highly realistic, yet diverse\nsolutions from incomplete and possibly noisy measurements, ultimately enabling\nuncertainty quantification at scale. However, because of the intractable nature\nof the score function of the likelihood term (i.e., $\\nabla_{\\mathbf{x}_t}\np(\\mathbf{y} | \\mathbf{x}_t)$), various samplers have been proposed in the\nliterature that use different (more or less accurate) approximations of such a\ngradient to guide the diffusion process towards solutions that match the\nobservations. In this work, I consider two sampling algorithms recently\nproposed under the name of Diffusion Posterior Sampling (DPS) and\nPseudo-inverse Guided Diffusion Model (PGDM), respectively. In DPS, the\nguidance term used at each step of the reverse diffusion process is obtained by\napplying the adjoint of the modeling operator to the residual obtained from a\none-step denoising estimate of the solution. On the other hand, PGDM utilizes a\npseudo-inverse operator that originates from the fact that the one-step\ndenoised solution is not assumed to be deterministic, rather modeled as a\nGaussian distribution. 
Through an extensive set of numerical examples on two\ngeophysical inverse problems (namely, seismic interpolation and seismic\ninversion), I show that two key aspects for the success of any\nmeasurement-guided diffusion process are: i) our ability to re-parametrize the\ninverse problem such that the sought after model is bounded between -1 and 1 (a\npre-requisite for any diffusion model); ii) the choice of the training dataset\nused to learn the implicit prior that guides the reverse diffusion process.\nNumerical examples on synthetic and field datasets reveal that PGDM outperforms\nDPS in both scenarios at limited additional cost.\n","authors":["Matteo Ravasi"],"pdf_url":"https://arxiv.org/pdf/2501.04881v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04880v1","updated":"2025-01-08T23:28:28Z","published":"2025-01-08T23:28:28Z","title":"Leveraging Log Probabilities in Language Models to Forecast Future\n Events","summary":" In the constantly changing field of data-driven decision making, accurately\npredicting future events is crucial for strategic planning in various sectors.\nThe emergence of Large Language Models (LLMs) marks a significant advancement\nin this area, offering advanced tools that utilise extensive text data for\nprediction. In this industry paper, we introduce a novel method for AI-driven\nforesight using LLMs. Building on top of previous research, we employ data on\ncurrent trends and their trajectories for generating forecasts on 15 different\ntopics. Subsequently, we estimate their probabilities via a multi-step approach\nbased on log probabilities. 
We show we achieve a Brier score of 0.186, meaning\na +26% improvement over random chance and a +19% improvement over\nwidely-available AI systems.\n","authors":["Tommaso Soru","Jim Marshall"],"pdf_url":"https://arxiv.org/pdf/2501.04880v1.pdf","comment":"5 pages, 4 figures"},{"id":"http://arxiv.org/abs/2501.04879v1","updated":"2025-01-08T23:22:08Z","published":"2025-01-08T23:22:08Z","title":"Multilinear Tensor Low-Rank Approximation for Policy-Gradient Methods in\n Reinforcement Learning","summary":" Reinforcement learning (RL) aims to estimate the action to take given a\n(time-varying) state, with the goal of maximizing a cumulative reward function.\nPredominantly, there are two families of algorithms to solve RL problems:\nvalue-based and policy-based methods, with the latter designed to learn a\nprobabilistic parametric policy from states to actions. Most contemporary\napproaches implement this policy using a neural network (NN). However, NNs\nusually face issues related to convergence, architectural suitability,\nhyper-parameter selection, and underutilization of the redundancies of the\nstate-action representations (e.g. locally similar states). This paper\npostulates multi-linear mappings to efficiently estimate the parameters of the\nRL policy. More precisely, we leverage the PARAFAC decomposition to design\ntensor low-rank policies. The key idea involves collecting the policy\nparameters into a tensor and leveraging tensor-completion techniques to enforce\nlow rank. We establish theoretical guarantees of the proposed methods for\nvarious policy classes and validate their efficacy through numerical\nexperiments. Specifically, we demonstrate that tensor low-rank policy models\nreduce computational and sample complexities in comparison to NN models while\nachieving similar rewards.\n","authors":["Sergio Rozada","Hoi-To Wai","Antonio G. 
Marques"],"pdf_url":"https://arxiv.org/pdf/2501.04879v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00958v2","updated":"2025-01-08T23:16:20Z","published":"2024-05-02T02:50:58Z","title":"Generative manufacturing systems using diffusion models and ChatGPT","summary":" In this study, we introduce Generative Manufacturing Systems (GMS) as a novel\napproach to effectively manage and coordinate autonomous manufacturing assets,\nthereby enhancing their responsiveness and flexibility to address a wide array\nof production objectives and human preferences. Deviating from traditional\nexplicit modeling, GMS employs generative AI, including diffusion models and\nChatGPT, for implicit learning from envisioned futures, marking a shift from\nmodel-optimum to training-sampling decision-making. Through the integration\nof generative AI, GMS enables complex decision-making through interactive\ndialogue with humans, allowing manufacturing assets to generate multiple\nhigh-quality global decisions that can be iteratively refined based on human\nfeedback. Empirical findings showcase GMS's substantial improvement in system\nresilience and responsiveness to uncertainties, with decision times reduced\nfrom seconds to milliseconds. The study underscores the inherent creativity and\ndiversity in the generated solutions, facilitating human-centric\ndecision-making through seamless and continuous human-machine interactions.\n","authors":["Xingyu Li","Fei Tao","Wei Ye","Aydin Nassehi","John W. Sutherland"],"pdf_url":"https://arxiv.org/pdf/2405.00958v2.pdf","comment":"We are withdrawing this preprint to incorporate significant new\n results and expand the scope of the paper. 
We plan to resubmit a\n substantially revised version in the near future"},{"id":"http://arxiv.org/abs/2501.04871v1","updated":"2025-01-08T23:04:32Z","published":"2025-01-08T23:04:32Z","title":"RieszBoost: Gradient Boosting for Riesz Regression","summary":" Answering causal questions often involves estimating linear functionals of\nconditional expectations, such as the average treatment effect or the effect of\na longitudinal modified treatment policy. By the Riesz representation theorem,\nthese functionals can be expressed as the expected product of the conditional\nexpectation of the outcome and the Riesz representer, a key component in doubly\nrobust estimation methods. Traditionally, the Riesz representer is estimated\nindirectly by deriving its explicit analytical form, estimating its components,\nand substituting these estimates into the known form (e.g., the inverse\npropensity score). However, deriving or estimating the analytical form can be\nchallenging, and substitution methods are often sensitive to practical\npositivity violations, leading to higher variance and wider confidence\nintervals. In this paper, we propose a novel gradient boosting algorithm to\ndirectly estimate the Riesz representer without requiring its explicit\nanalytical form. This method is particularly suited for tabular data, offering\na flexible, nonparametric, and computationally efficient alternative to\nexisting methods for Riesz regression. Through simulation studies, we\ndemonstrate that our algorithm performs on par with or better than indirect\nestimation techniques across a range of functionals, providing a user-friendly\nand robust solution for estimating causal quantities.\n","authors":["Kaitlyn J. 
Lee","Alejandro Schuler"],"pdf_url":"https://arxiv.org/pdf/2501.04871v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04870v1","updated":"2025-01-08T23:03:18Z","published":"2025-01-08T23:03:18Z","title":"Deep Transfer $Q$-Learning for Offline Non-Stationary Reinforcement\n Learning","summary":" In dynamic decision-making scenarios across business and healthcare,\nleveraging sample trajectories from diverse populations can significantly\nenhance reinforcement learning (RL) performance for specific target\npopulations, especially when sample sizes are limited. While existing transfer\nlearning methods primarily focus on linear regression settings, they lack\ndirect applicability to reinforcement learning algorithms. This paper pioneers\nthe study of transfer learning for dynamic decision scenarios modeled by\nnon-stationary finite-horizon Markov decision processes, utilizing neural\nnetworks as powerful function approximators and backward inductive learning. We\ndemonstrate that naive sample pooling strategies, effective in regression\nsettings, fail in Markov decision processes. To address this challenge, we\nintroduce a novel ``re-weighted targeting procedure'' to construct\n``transferable RL samples'' and propose ``transfer deep $Q^*$-learning'',\nenabling neural network approximation with theoretical guarantees. We assume\nthat the reward functions are transferable and deal with both situations in\nwhich the transition densities are transferable or nontransferable. Our\nanalytical techniques for transfer learning in neural network approximation and\ntransition density transfers have broader implications, extending to supervised\ntransfer learning with neural networks and domain shift scenarios. 
Empirical\nexperiments on both synthetic and real datasets corroborate the advantages of\nour method, showcasing its potential for improving decision-making through\nstrategically constructing transferable RL samples in non-stationary\nreinforcement learning contexts.\n","authors":["Jinhang Chai","Elynn Chen","Jianqing Fan"],"pdf_url":"https://arxiv.org/pdf/2501.04870v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.19595v2","updated":"2025-01-08T22:29:03Z","published":"2024-10-25T14:43:32Z","title":"Mask-Weighted Spatial Likelihood Coding for Speaker-Independent Joint\n Localization and Mask Estimation","summary":" Due to their robustness and flexibility, neural-driven beamformers are a\npopular choice for speech separation in challenging environments with a varying\namount of simultaneous speakers alongside noise and reverberation.\nTime-frequency masks and relative directions of the speakers regarding a fixed\nspatial grid can be used to estimate the beamformer's parameters. To some\ndegree, speaker-independence is achieved by ensuring a greater amount of\nspatial partitions than speech sources. In this work, we analyze how to encode\nboth mask and positioning into such a grid to enable joint estimation of both\nquantities. We propose mask-weighted spatial likelihood coding and show that it\nachieves considerable performance in both tasks compared to baseline encodings\noptimized for either localization or mask estimation. In the same setup, we\ndemonstrate superiority for joint estimation of both quantities. Conclusively,\nwe propose a universal approach which can replace an upstream sound source\nlocalization system solely by adapting the training framework, making it highly\nrelevant in performance-critical scenarios.\n","authors":["Jakob Kienegger","Alina Mannanova","Timo Gerkmann"],"pdf_url":"https://arxiv.org/pdf/2410.19595v2.pdf","comment":"\\copyright 2025 IEEE. 
Personal use of this material is permitted.\n Permission from IEEE must be obtained for all other uses, in any current or\n future media, including reprinting/republishing this material for advertising\n or promotional purposes, creating new collective works, for resale or\n redistribution to servers or lists, or reuse of any copyrighted component of\n this work in other works"},{"id":"http://arxiv.org/abs/2501.03489v2","updated":"2025-01-08T22:22:43Z","published":"2025-01-07T03:17:47Z","title":"Entropy-Guided Attention for Private LLMs","summary":" The pervasiveness of proprietary language models has raised critical privacy\nconcerns, necessitating advancements in private inference (PI), where\ncomputations are performed directly on encrypted data without revealing users'\nsensitive information. While PI offers a promising solution, its practical\ndeployment is hindered by substantial communication and latency overheads,\nprimarily stemming from nonlinear operations. To address this, we introduce an\ninformation-theoretic framework to characterize the role of nonlinearities in\ndecoder-only language models, laying a principled foundation for optimizing\ntransformer-architectures tailored to the demands of PI.\n By leveraging Shannon's entropy as a quantitative measure, we uncover the\npreviously unexplored dual significance of nonlinearities: beyond ensuring\ntraining stability, they are crucial for maintaining attention head diversity.\nSpecifically, we find that their removal triggers two critical failure modes:\n{\\em entropy collapse} in deeper layers that destabilizes training, and {\\em\nentropic overload} in earlier layers that leads to under-utilization of\nMulti-Head Attention's (MHA) representational capacity.\n We propose an entropy-guided attention mechanism paired with a novel entropy\nregularization technique to mitigate entropic overload. 
Additionally, we\nexplore PI-friendly alternatives to layer normalization for preventing entropy\ncollapse and stabilizing the training of LLMs with reduced-nonlinearities. Our\nstudy bridges the gap between information theory and architectural design,\nestablishing entropy dynamics as a principled guide for developing efficient PI\narchitectures. The code and implementation are available at\nhttps://github.com/Nandan91/entropy-guided-attention-llm\n","authors":["Nandan Kumar Jha","Brandon Reagen"],"pdf_url":"https://arxiv.org/pdf/2501.03489v2.pdf","comment":"Accepted to the 6th AAAI Workshop on Privacy-Preserving Artificial\n Intelligence (PPAI), 2025. arXiv admin note: substantial text overlap with\n arXiv:2410.13060"},{"id":"http://arxiv.org/abs/2409.18153v2","updated":"2025-01-08T22:20:36Z","published":"2024-09-25T20:00:23Z","title":"Most Influential Subset Selection: Challenges, Promises, and Beyond","summary":" How can we attribute the behaviors of machine learning models to their\ntraining data? While the classic influence function sheds light on the impact\nof individual samples, it often fails to capture the more complex and\npronounced collective influence of a set of samples. To tackle this challenge,\nwe study the Most Influential Subset Selection (MISS) problem, which aims to\nidentify a subset of training samples with the greatest collective influence.\nWe conduct a comprehensive analysis of the prevailing approaches in MISS,\nelucidating their strengths and weaknesses. Our findings reveal that\ninfluence-based greedy heuristics, a dominant class of algorithms in MISS, can\nprovably fail even in linear regression. We delineate the failure modes,\nincluding the errors of influence function and the non-additive structure of\nthe collective influence. Conversely, we demonstrate that an adaptive version\nof these heuristics which applies them iteratively, can effectively capture the\ninteractions among samples and thus partially address the issues. 
Experiments\non real-world datasets corroborate these theoretical findings and further\ndemonstrate that the merit of adaptivity can extend to more complex scenarios\nsuch as classification tasks and non-linear neural networks. We conclude our\nanalysis by emphasizing the inherent trade-off between performance and\ncomputational efficiency, questioning the use of additive metrics such as the\nLinear Datamodeling Score, and offering a range of discussions.\n","authors":["Yuzheng Hu","Pingbang Hu","Han Zhao","Jiaqi W. Ma"],"pdf_url":"https://arxiv.org/pdf/2409.18153v2.pdf","comment":"Accepted at the 38th Conference on Neural Information Processing\n Systems (NeurIPS 2024) Edit: Added discussion on a concurrent work"},{"id":"http://arxiv.org/abs/2406.14469v6","updated":"2025-01-08T21:47:16Z","published":"2024-06-20T16:32:18Z","title":"Forecasting Symmetric Random Walks: A Fusion Approach","summary":" Forecasting random walks is notoriously challenging, with na\\\"ive prediction\nserving as a difficult-to-surpass baseline. To investigate the potential of\nusing movement predictions to improve point forecasts in this context, this\nstudy focuses on symmetric random walks, in which the target variable's future\nvalue is reformulated as a combination of its future movement and current\nvalue. The proposed forecasting method, termed the fusion of movement and\nna\\\"ive predictions (FMNP), is grounded in this reformulation. The simulation\nresults show that FMNP achieves statistically significant improvements over\nna\\\"ive prediction, even when the movement prediction accuracy is only slightly\nabove 0.50. In practice, movement predictions can be derived from the\ncomovement between an exogenous variable and the target variable and then\nlinearly combined with the na\\\"ive prediction to generate the final forecast.\nFMNP effectiveness was evaluated on four U.S. 
financial time series -- the\nclose prices of Boeing (BA), Brent crude oil (OIL), Halliburton (HAL), and\nSchlumberger (SLB) -- using the open price of the Financial Times Stock\nExchange (FTSE) index as the exogenous variable. In all the cases, FMNP\noutperformed the na\\\"ive prediction, demonstrating its efficacy in forecasting\nsymmetric random walks and its potential applicability to other forecasting\ntasks.\n","authors":["Cheng Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.14469v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04845v1","updated":"2025-01-08T21:13:50Z","published":"2025-01-08T21:13:50Z","title":"Intelligent experiments through real-time AI: Fast Data Processing and\n Autonomous Detector Control for sPHENIX and future EIC detectors","summary":" This R\\&D project, initiated by the DOE Nuclear Physics AI-Machine Learning\ninitiative in 2022, leverages AI to address data processing challenges in\nhigh-energy nuclear experiments (RHIC, LHC, and future EIC). Our focus is on\ndeveloping a demonstrator for real-time processing of high-rate data streams\nfrom sPHENIX experiment tracking detectors. The limitations of a 15 kHz maximum\ntrigger rate imposed by the calorimeters can be negated by intelligent use of\nstreaming technology in the tracking system. The approach efficiently\nidentifies low momentum rare heavy flavor events in high-rate p+p collisions\n(3MHz), using Graph Neural Network (GNN) and High Level Synthesis for Machine\nLearning (hls4ml). Success at sPHENIX promises immediate benefits, minimizing\nresources and accelerating the heavy-flavor measurements. The approach is\ntransferable to other fields. For the EIC, we develop a DIS-electron tagger\nusing Artificial Intelligence - Machine Learning (AI-ML) algorithms for\nreal-time identification, showcasing the transformative potential of AI and\nFPGA technologies in high-energy nuclear and particle experiments real-time\ndata processing pipelines.\n","authors":["J. Kvapil","G. 
Borca-Tasciuc","H. Bossi","K. Chen","Y. Chen","Y. Corrales Morales","H. Da Costa","C. Da Silva","C. Dean","J. Durham","S. Fu","C. Hao","P. Harris","O. Hen","H. Jheng","Y. Lee","P. Li","X. Li","Y. Lin","M. X. Liu","V. Loncar","J. P. Mitrevski","A. Olvera","M. L. Purschke","J. S. Renck","G. Roland","J. Schambach","Z. Shi","N. Tran","N. Wuerfel","B. Xu","D. Yu","H. Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.04845v1.pdf","comment":"proceedings for 42nd International Conference on High Energy Physics\n (ICHEP2024), 18-24 July 2024, Prague, Czech Republic"},{"id":"http://arxiv.org/abs/2408.08260v2","updated":"2025-01-08T21:12:48Z","published":"2024-08-15T17:01:00Z","title":"GSVD-NMF: Recovering Missing Features in Non-negative Matrix\n Factorization","summary":" Non-negative matrix factorization (NMF) is an important tool in signal\nprocessing and widely used to separate mixed sources into their components.\nAlgorithms for NMF require that the user choose the number of components in\nadvance, and if the results are unsatisfying one typically needs to start again\nwith a different number of components. To make NMF more interactive and\nincremental, here we introduce GSVD-NMF, a method that proposes new components\nbased on the generalized singular value decomposition (GSVD) to address\ndiscrepancies between the initial under-complete NMF results and the SVD of the\noriginal matrix. Simulation and experimental results demonstrate that GSVD-NMF\noften effectively recovers multiple missing components in under-complete NMF,\nwith the recovered NMF solutions frequently reaching better local optima. The\nresults further show that GSVD-NMF is compatible with various NMF algorithms\nand that directly augmenting components is more efficient than rerunning NMF\nfrom scratch with additional components. 
By deliberately starting from\nunder-complete NMF, GSVD-NMF has the potential to be a recommended approach for\na range of general NMF applications.\n","authors":["Youdong Guo","Timothy E. Holy"],"pdf_url":"https://arxiv.org/pdf/2408.08260v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16542v3","updated":"2025-01-08T21:05:26Z","published":"2024-03-25T08:35:19Z","title":"Differentially Private Online Federated Learning with Correlated Noise","summary":" We introduce a novel differentially private algorithm for online federated\nlearning that employs temporally correlated noise to enhance utility while\nensuring privacy of continuously released models. To address challenges posed\nby DP noise and local updates with streaming non-iid data, we develop a\nperturbed iterate analysis to control the impact of the DP noise on the\nutility. Moreover, we demonstrate how the drift errors from local updates can\nbe effectively managed under a quasi-strong convexity condition. Subject to an\n$(\\epsilon, \\delta)$-DP budget, we establish a dynamic regret bound over the\nentire time horizon, quantifying the impact of key parameters and the intensity\nof changes in dynamic environments. Numerical experiments confirm the efficacy\nof the proposed algorithm.\n","authors":["Jiaojiao Zhang","Linglingzhi Zhu","Mikael Johansson"],"pdf_url":"https://arxiv.org/pdf/2403.16542v3.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2411.18752v2","updated":"2025-01-08T20:51:17Z","published":"2024-11-27T20:56:43Z","title":"Locally Differentially Private Online Federated Learning With Correlated\n Noise","summary":" We introduce a locally differentially private (LDP) algorithm for online\nfederated learning that employs temporally correlated noise to improve utility\nwhile preserving privacy. To address challenges posed by the correlated noise\nand local updates with streaming non-IID data, we develop a perturbed iterate\nanalysis that controls the impact of the noise on the utility. 
Moreover, we\ndemonstrate how the drift errors from local updates can be effectively managed\nfor several classes of nonconvex loss functions. Subject to an\n$(\\epsilon,\\delta)$-LDP budget, we establish a dynamic regret bound that\nquantifies the impact of key parameters and the intensity of changes in the\ndynamic environment on the learning performance. Numerical experiments confirm\nthe efficacy of the proposed algorithm.\n","authors":["Jiaojiao Zhang","Linglingzhi Zhu","Dominik Fay","Mikael Johansson"],"pdf_url":"https://arxiv.org/pdf/2411.18752v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2403.16542"},{"id":"http://arxiv.org/abs/2501.01447v2","updated":"2025-01-08T20:50:40Z","published":"2024-12-30T02:48:40Z","title":"Analyzing Country-Level Vaccination Rates and Determinants of Practical\n Capacity to Administer COVID-19 Vaccines","summary":" The COVID-19 vaccine development, manufacturing, transportation, and\nadministration proved an extreme logistics operation of global magnitude.\nGlobal vaccination levels, however, remain a key concern in preventing the\nemergence of new strains and minimizing the impact of the pandemic's disruption\nof daily life. In this paper, country-level vaccination rates are analyzed\nthrough a queuing framework to extract service rates that represent the\npractical capacity of a country to administer vaccines. These rates are further\ncharacterized through regression and interpretable machine learning methods\nwith country-level demographic, governmental, and socio-economic variates.\nModel results show that participation in multi-governmental collaborations such\nas COVAX may improve the ability to vaccinate. 
Similarly, improved\ntransportation and accessibility variates such as roads per area for low-income\ncountries and rail lines per area for high-income countries can improve rates.\nIt was also found that for low-income countries specifically, improvements in\nbasic and health infrastructure (as measured through spending on healthcare,\nnumber of doctors and hospital beds per 100k, population percent with access to\nelectricity, life expectancy, and vehicles per 1000 people) resulted in higher\nvaccination rates. Of the high-income countries, those with larger 65-plus\npopulations struggled to vaccinate at high rates, indicating potential\naccessibility issues for the elderly. This study finds that improving basic and\nhealth infrastructure, focusing on accessibility in the last mile, particularly\nfor the elderly, and fostering global partnerships can improve logistical\noperations of such a scale. Such structural impediments and inequities in\nglobal health care must be addressed in preparation for future global public\nhealth crises.\n","authors":["Sharika J. Hegde","Max T. M. Ng","Marcos Rios","Hani S. Mahmassani","Ying Chen","Karen Smilowitz"],"pdf_url":"https://arxiv.org/pdf/2501.01447v2.pdf","comment":"Under consideration for more thorough analysis"},{"id":"http://arxiv.org/abs/2501.04831v1","updated":"2025-01-08T20:36:40Z","published":"2025-01-08T20:36:40Z","title":"Quantum Hybrid Support Vector Machines for Stress Detection in Older\n Adults","summary":" Stress can increase the possibility of cognitive impairment and decrease the\nquality of life in older adults. Smart healthcare can deploy quantum machine\nlearning to enable preventive and diagnostic support. This work introduces a\nunique technique to address stress detection as an anomaly detection problem\nthat uses quantum hybrid support vector machines. 
With the help of a wearable\nsmartwatch, we mapped baseline sensor reading as normal data and stressed\nsensor reading as anomaly data using cortisol concentration as the ground\ntruth. We have used quantum computing techniques to explore the complex feature\nspaces with kernel-based preprocessing. We illustrate the usefulness of our\nmethod by doing experimental validation on 40 older adults with the help of the\nTSST protocol. Our findings highlight that using a limited number of features,\nquantum machine learning provides improved accuracy compared to classical\nmethods. We also observed that the recall value using quantum machine learning\nis higher compared to the classical method. The higher recall value illustrates\nthe potential of quantum machine learning in healthcare, as missing anomalies\ncould result in delayed diagnostics or treatment.\n","authors":["Md Saif Hassan Onim","Travis S. Humble","Himanshu Thapliyal"],"pdf_url":"https://arxiv.org/pdf/2501.04831v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15237v3","updated":"2025-01-08T20:34:02Z","published":"2024-08-27T17:56:11Z","title":"The Mamba in the Llama: Distilling and Accelerating Hybrid Models","summary":" Linear RNN architectures, like Mamba, can be competitive with Transformer\nmodels in language modeling while having advantageous deployment\ncharacteristics. Given the focus on training large-scale Transformer models, we\nconsider the challenge of converting these pretrained models for deployment. We\ndemonstrate that it is feasible to distill large Transformers into linear RNNs\nby reusing the linear projection weights from attention layers with academic\nGPU resources. The resulting hybrid model, which incorporates a quarter of the\nattention layers, achieves performance comparable to the original Transformer\nin chat benchmarks and outperforms open-source hybrid Mamba models trained from\nscratch with trillions of tokens in both chat benchmarks and general\nbenchmarks. 
Moreover, we introduce a hardware-aware speculative decoding\nalgorithm that accelerates the inference speed of Mamba and hybrid models.\nOverall we show how, with limited computation resources, we can remove many of\nthe original attention layers and generate from the resulting model more\nefficiently. Our top-performing model, distilled from Llama3-8B-Instruct,\nachieves a 29.61 length-controlled win rate on AlpacaEval 2 against GPT-4 and\n7.35 on MT-Bench, surpassing the best 8B scale instruction-tuned linear RNN\nmodel. We also find that the distilled model has natural length extrapolation,\nshowing almost perfect accuracy in the needle-in-a-haystack test at 20x the\ndistillation length. Code and pre-trained checkpoints are open-sourced at\nhttps://github.com/jxiw/MambaInLlama and\nhttps://github.com/itsdaniele/speculative_mamba.\n","authors":["Junxiong Wang","Daniele Paliotta","Avner May","Alexander M. Rush","Tri Dao"],"pdf_url":"https://arxiv.org/pdf/2408.15237v3.pdf","comment":"NeurIPS 2024. v3 updates: fix format errors"},{"id":"http://arxiv.org/abs/2501.04826v1","updated":"2025-01-08T20:26:13Z","published":"2025-01-08T20:26:13Z","title":"Intelligent Gradient Boosting Algorithms for Estimating Strength of\n Modified Subgrade Soil","summary":" The performance of pavement under loading depends on the strength of the\nsubgrade. However, experimental estimation of properties of pavement strengths\nsuch as California bearing ratio (CBR), unconfined compressive strength (UCS)\nand resistance value (R) are often tedious, time-consuming and costly, thereby\ninspiring a growing interest in machine learning based tools which are simple,\ncheap and fast alternatives. 
Thus, the potential application of two boosting\ntechniques, categorical boosting (CatBoost) and extreme gradient boosting\n(XGBoost), alongside support vector regression (SVR), is explored in this\nstudy for estimation of properties of subgrade soil modified with hydrated lime\nactivated rice husk ash (HARSH). Using 121 experimental data samples of varying\nproportions of HARSH, plastic limit, liquid limit, plasticity index, clay\nactivity, optimum moisture content, and maximum dry density as input for CBR,\nUCS and R estimation, four evaluation metrics namely coefficient of\ndetermination (R2), root mean squared error (RMSE), mean absolute error (MAE)\nand mean absolute percentage error (MAPE) are used to evaluate the models'\nperformance. The results indicate that XGBoost outperformed CatBoost and SVR in\nestimating these properties, yielding R2 of 0.9994, 0.9995 and 0.9999 in\nestimating the CBR, UCS and R respectively. Also, SVR outperformed CatBoost in\nestimating the CBR and R with R2 of 0.9997 respectively. On the other hand,\nCatBoost outperformed SVR in estimating the UCS with R2 of 0.9994. Feature\nsensitivity analysis shows that the three machine learning techniques agree\nthat increasing the HARSH proportion leads to increased values of the estimated\nproperties. A comparison with previous results also shows the\nsuperiority of XGBoost in estimating subgrade properties.\n","authors":["Ismail B. Mustapha","Muyideen Abdulkareem","Shafaatunnur Hasan","Abideen Ganiyu","Hatem Nabus","Jin Chai Lee"],"pdf_url":"https://arxiv.org/pdf/2501.04826v1.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2501.04817v1","updated":"2025-01-08T20:14:07Z","published":"2025-01-08T20:14:07Z","title":"Decentralised Resource Sharing in TinyML: Wireless Bilayer Gossip\n Parallel SGD for Collaborative Learning","summary":" With the growing computational capabilities of microcontroller units (MCUs),\nedge devices can now support machine learning models. 
However, deploying\ndecentralised federated learning (DFL) on such devices presents key challenges,\nincluding intermittent connectivity, limited communication range, and dynamic\nnetwork topologies. This paper proposes a novel framework, bilayer Gossip\nDecentralised Parallel Stochastic Gradient Descent (GD PSGD), designed to\naddress these issues in resource-constrained environments. The framework\nincorporates a hierarchical communication structure using Distributed Kmeans\n(DKmeans) clustering for geographic grouping and a gossip protocol for\nefficient model aggregation across two layers: intra-cluster and inter-cluster.\nWe evaluate the framework's performance against the Centralised Federated\nLearning (CFL) baseline using the MCUNet model on the CIFAR-10 dataset under\nIID and Non-IID conditions. Results demonstrate that the proposed method\nachieves comparable accuracy to CFL on IID datasets, requiring only 1.8\nadditional rounds for convergence. On Non-IID datasets, the accuracy loss\nremains under 8\\% for moderate data imbalance. These findings highlight the\nframework's potential to support scalable and privacy-preserving learning on\nedge devices with minimal performance trade-offs.\n","authors":["Ziyuan Bao","Eiman Kanjo","Soumya Banerjee","Hasib-Al Rashid","Tinoosh Mohsenin"],"pdf_url":"https://arxiv.org/pdf/2501.04817v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04816v1","updated":"2025-01-08T20:12:33Z","published":"2025-01-08T20:12:33Z","title":"Probabilistic Skip Connections for Deterministic Uncertainty\n Quantification in Deep Neural Networks","summary":" Deterministic uncertainty quantification (UQ) in deep learning aims to\nestimate uncertainty with a single pass through a network by leveraging outputs\nfrom the network's feature extractor. Existing methods require that the feature\nextractor be both sensitive and smooth, ensuring meaningful input changes\nproduce meaningful changes in feature vectors. 
Smoothness enables\ngeneralization, while sensitivity prevents feature collapse, where distinct\ninputs are mapped to identical feature vectors. To meet these requirements,\ncurrent deterministic methods often retrain networks with spectral\nnormalization. Instead of modifying training, we propose using measures of\nneural collapse to identify an existing intermediate layer that is both\nsensitive and smooth. We then fit a probabilistic model to the feature vector\nof this intermediate layer, which we call a probabilistic skip connection\n(PSC). Through empirical analysis, we explore the impact of spectral\nnormalization on neural collapse and demonstrate that PSCs can effectively\ndisentangle aleatoric and epistemic uncertainty. Additionally, we show that\nPSCs achieve uncertainty quantification and out-of-distribution (OOD) detection\nperformance that matches or exceeds existing single-pass methods requiring\ntraining modifications. By retrofitting existing models, PSCs enable\nhigh-quality UQ and OOD capabilities without retraining.\n","authors":["Felix Jimenez","Matthias Katzfuss"],"pdf_url":"https://arxiv.org/pdf/2501.04816v1.pdf","comment":"15 pages, 9 figures"},{"id":"http://arxiv.org/abs/2412.16339v2","updated":"2025-01-08T20:11:59Z","published":"2024-12-20T21:00:11Z","title":"Deliberative Alignment: Reasoning Enables Safer Language Models","summary":" As large-scale language models increasingly impact safety-critical domains,\nensuring their reliable adherence to well-defined principles remains a\nfundamental challenge. We introduce Deliberative Alignment, a new paradigm that\ndirectly teaches the model safety specifications and trains it to explicitly\nrecall and accurately reason over the specifications before answering. We used\nthis approach to align OpenAI's o-series models, and achieved highly precise\nadherence to OpenAI's safety policies, without requiring human-written\nchain-of-thoughts or answers. 
Deliberative Alignment pushes the Pareto frontier\nby simultaneously increasing robustness to jailbreaks while decreasing\noverrefusal rates, and also improves out-of-distribution generalization. We\ndemonstrate that reasoning over explicitly specified policies enables more\nscalable, trustworthy, and interpretable alignment.\n","authors":["Melody Y. Guan","Manas Joglekar","Eric Wallace","Saachi Jain","Boaz Barak","Alec Helyar","Rachel Dias","Andrea Vallone","Hongyu Ren","Jason Wei","Hyung Won Chung","Sam Toyer","Johannes Heidecke","Alex Beutel","Amelia Glaese"],"pdf_url":"https://arxiv.org/pdf/2412.16339v2.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2501.01950v2","updated":"2025-01-08T20:09:16Z","published":"2025-01-03T18:54:26Z","title":"MADGEN: Mass-Spec attends to De Novo Molecular generation","summary":" The annotation (assigning structural chemical identities) of MS/MS spectra\nremains a significant challenge due to the enormous molecular diversity in\nbiological samples and the limited scope of reference databases. Currently, the\nvast majority of spectral measurements remain in the \"dark chemical space\"\nwithout structural annotations. To improve annotation, we propose MADGEN\n(Mass-spec Attends to De Novo Molecular GENeration), a scaffold-based method\nfor de novo molecular structure generation guided by mass spectrometry data.\nMADGEN operates in two stages: scaffold retrieval and spectra-conditioned\nmolecular generation starting with the scaffold. In the first stage, given an\nMS/MS spectrum, we formulate scaffold retrieval as a ranking problem and employ\ncontrastive learning to align mass spectra with candidate molecular scaffolds.\nIn the second stage, starting from the retrieved scaffold, we employ the MS/MS\nspectrum to guide an attention-based generative model to generate the final\nmolecule. Our approach constrains the molecular generation search space,\nreducing its complexity and improving generation accuracy. 
We evaluate MADGEN\non three datasets (NIST23, CANOPUS, and MassSpecGym) and evaluate MADGEN's\nperformance with a predictive scaffold retriever and with an oracle retriever.\nWe demonstrate the effectiveness of using attention to integrate spectral\ninformation throughout the generation process to achieve strong results with\nthe oracle retriever.\n","authors":["Yinkai Wang","Xiaohui Chen","Liping Liu","Soha Hassoun"],"pdf_url":"https://arxiv.org/pdf/2501.01950v2.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2501.04811v1","updated":"2025-01-08T19:59:48Z","published":"2025-01-08T19:59:48Z","title":"Fast, Fine-Grained Equivalence Checking for Neural Decompilers","summary":" Neural decompilers are machine learning models that reconstruct the source\ncode from an executable program. Critical to the lifecycle of any machine\nlearning model is an evaluation of its effectiveness. However, existing\ntechniques for evaluating neural decompilation models have substantial\nweaknesses, especially when it comes to showing the correctness of the neural\ndecompiler's predictions. To address this, we introduce codealign, a novel\ninstruction-level code equivalence technique designed for neural decompilers.\nWe provide a formal definition of a relation between equivalent instructions,\nwhich we term an equivalence alignment. We show how codealign generates\nequivalence alignments, then evaluate codealign by comparing it with symbolic\nexecution. Finally, we show how the information codealign provides-which parts\nof the functions are equivalent and how well the variable names match-is\nsubstantially more detailed than existing state-of-the-art evaluation metrics,\nwhich report unitless numbers measuring similarity.\n","authors":["Luke Dramko","Claire Le Goues","Edward J. 
Schwartz"],"pdf_url":"https://arxiv.org/pdf/2501.04811v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04794v1","updated":"2025-01-08T19:18:44Z","published":"2025-01-08T19:18:44Z","title":"A Steerable Deep Network for Model-Free Diffusion MRI Registration","summary":" Nonrigid registration is vital to medical image analysis but remains\nchallenging for diffusion MRI (dMRI) due to its high-dimensional,\norientation-dependent nature. While classical methods are accurate, they are\ncomputationally demanding, and deep neural networks, though efficient, have\nbeen underexplored for nonrigid dMRI registration compared to structural\nimaging. We present a novel, deep learning framework for model-free, nonrigid\nregistration of raw diffusion MRI data that does not require explicit\nreorientation. Unlike previous methods relying on derived representations such\nas diffusion tensors or fiber orientation distribution functions, in our\napproach, we formulate the registration as an equivariant diffeomorphism of\nposition-and-orientation space. Central to our method is an\n$\\mathsf{SE}(3)$-equivariant UNet that generates velocity fields while\npreserving the geometric properties of a raw dMRI's domain. We introduce a new\nloss function based on the maximum mean discrepancy in Fourier space,\nimplicitly matching ensemble average propagators across images. Experimental\nresults on Human Connectome Project dMRI data demonstrate competitive\nperformance compared to state-of-the-art approaches, with the added advantage\nof bypassing the overhead for estimating derived representations. This work\nestablishes a foundation for data-driven, geometry-aware dMRI registration\ndirectly in the acquisition space.\n","authors":["Gianfranco Cortes","Baba C. 
Vemuri"],"pdf_url":"https://arxiv.org/pdf/2501.04794v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.11977v2","updated":"2025-01-08T19:17:14Z","published":"2024-10-15T18:33:42Z","title":"Generative AI Policies under the Microscope: How CS Conferences Are\n Navigating the New Frontier in Scholarly Writing","summary":" This paper explores the current state of generative AI policies of computer\nscience conferences and offers guidelines for policy adoption.\n","authors":["Mahjabin Nahar","Sian Lee","Becky Guillen","Dongwon Lee"],"pdf_url":"https://arxiv.org/pdf/2410.11977v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04784v1","updated":"2025-01-08T19:02:32Z","published":"2025-01-08T19:02:32Z","title":"Leveraging Registers in Vision Transformers for Robust Adaptation","summary":" Vision Transformers (ViTs) have shown success across a variety of tasks due\nto their ability to capture global image representations. Recent studies have\nidentified the existence of high-norm tokens in ViTs, which can interfere with\nunsupervised object discovery. To address this, the use of \"registers\" which\nare additional tokens that isolate high norm patch tokens while capturing\nglobal image-level information has been proposed. While registers have been\nstudied extensively for object discovery, their generalization properties\nparticularly in out-of-distribution (OOD) scenarios, remains underexplored. In\nthis paper, we examine the utility of register token embeddings in providing\nadditional features for improving generalization and anomaly rejection. To that\nend, we propose a simple method that combines the special CLS token embedding\ncommonly employed in ViTs with the average-pooled register embeddings to create\nfeature representations which are subsequently used for training a downstream\nclassifier. We find that this enhances OOD generalization and anomaly\nrejection, while maintaining in-distribution (ID) performance. 
Extensive\nexperiments across multiple ViT backbones trained with and without registers\nreveal consistent improvements of 2-4\\% in top-1 OOD accuracy and a 2-3\\%\nreduction in false positive rates for anomaly detection. Importantly, these\ngains are achieved without additional computational overhead.\n","authors":["Srikar Yellapragada","Kowshik Thopalli","Vivek Narayanaswamy","Wesam Sakla","Yang Liu","Yamen Mubarka","Dimitris Samaras","Jayaraman J. Thiagarajan"],"pdf_url":"https://arxiv.org/pdf/2501.04784v1.pdf","comment":"Accepted at ICASSP 2025"},{"id":"http://arxiv.org/abs/2410.17309v3","updated":"2025-01-08T19:00:00Z","published":"2024-10-22T18:00:00Z","title":"Literature Meets Data: A Synergistic Approach to Hypothesis Generation","summary":" AI holds promise for transforming scientific processes, including hypothesis\ngeneration. Prior work on hypothesis generation can be broadly categorized into\ntheory-driven and data-driven approaches. While both have proven effective in\ngenerating novel and plausible hypotheses, it remains an open question whether\nthey can complement each other. To address this, we develop the first method\nthat combines literature-based insights with data to perform LLM-powered\nhypothesis generation. We apply our method on five different datasets and\ndemonstrate that integrating literature and data outperforms other baselines\n(8.97\\% over few-shot, 15.75\\% over literature-based alone, and 3.37\\% over\ndata-driven alone). Additionally, we conduct the first human evaluation to\nassess the utility of LLM-generated hypotheses in assisting human\ndecision-making on two challenging tasks: deception detection and AI generated\ncontent detection. Our results show that human accuracy improves significantly\nby 7.44\\% and 14.19\\% on these tasks, respectively. 
These findings suggest that\nintegrating literature-based and data-driven approaches provides a\ncomprehensive and nuanced framework for hypothesis generation and could open\nnew avenues for scientific inquiry.\n","authors":["Haokun Liu","Yangqiaoyu Zhou","Mingxuan Li","Chenfei Yuan","Chenhao Tan"],"pdf_url":"https://arxiv.org/pdf/2410.17309v3.pdf","comment":"37 pages, 9 figures, code link:\n https://github.com/ChicagoHAI/hypothesis-generation"},{"id":"http://arxiv.org/abs/2501.04762v1","updated":"2025-01-08T18:08:48Z","published":"2025-01-08T18:08:48Z","title":"Efficient and Responsible Adaptation of Large Language Models for Robust\n and Equitable Top-k Recommendations","summary":" Conventional recommendation systems (RSs) are typically optimized to enhance\nperformance metrics uniformly across all training samples, inadvertently\noverlooking the needs of diverse user populations. The performance disparity\namong various populations can harm the model's robustness to sub-populations\ndue to the varying user properties. While large language models (LLMs) show\npromise in enhancing RS performance, their practical applicability is hindered\nby high costs, inference latency, and degraded performance on long user\nqueries. To address these challenges, we propose a hybrid task allocation\nframework designed to promote social good by equitably serving all user groups.\nBy adopting a two-phase approach, we promote a strategic assignment of tasks\nfor efficient and responsible adaptation of LLMs. Our strategy works by first\nidentifying the weak and inactive users that receive a suboptimal ranking\nperformance by RSs. Next, we use an in-context learning approach for such\nusers, wherein each user interaction history is contextualized as a distinct\nranking task. We evaluate our hybrid framework by incorporating eight different\nrecommendation algorithms and three different LLMs -- both open and\nclose-sourced. 
Our results on three real-world datasets show a significant\nreduction in weak users and improved robustness to subpopulations without\ndisproportionately escalating costs.\n","authors":["Kirandeep Kaur","Manya Chadha","Vinayak Gupta","Chirag Shah"],"pdf_url":"https://arxiv.org/pdf/2501.04762v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2405.00824"},{"id":"http://arxiv.org/abs/2501.04757v1","updated":"2025-01-08T17:11:56Z","published":"2025-01-08T17:11:56Z","title":"DAREK -- Distance Aware Error for Kolmogorov Networks","summary":" In this paper, we provide distance-aware error bounds for Kolmogorov Arnold\nNetworks (KANs). We call our new error bounds estimator DAREK -- Distance Aware\nError for Kolmogorov networks. Z. Liu et al. provide error bounds, which may be\nloose, lack distance-awareness, and are defined only up to an unknown constant\nof proportionality. We review the error bounds for Newton's polynomial, which\nis then generalized to an arbitrary spline, under Lipschitz continuity\nassumptions. We then extend these bounds to nested compositions of splines,\narriving at error bounds for KANs. We evaluate our method by estimating an\nobject's shape from sparse laser scan points. We use KAN to fit a smooth\nfunction to the scans and provide error bounds for the fit. We find that our\nmethod is faster than Monte Carlo approaches, and that our error bounds enclose\nthe true obstacle shape reliably.\n","authors":["Masoud Ataei","Mohammad Javad Khojasteh","Vikas Dhiman"],"pdf_url":"https://arxiv.org/pdf/2501.04757v1.pdf","comment":"Accepted at ICASSP25, 5 pages + 2 pages supplementary material, 3\n figures"},{"id":"http://arxiv.org/abs/2501.04613v1","updated":"2025-01-08T16:53:17Z","published":"2025-01-08T16:53:17Z","title":"A Semantic Partitioning Method for Large-Scale Training of Knowledge\n Graph Embeddings","summary":" In recent years, knowledge graph embeddings have achieved great success. 
Many\nmethods have been proposed and achieved state-of-the-art results in various\ntasks. However, most of the current methods present one or more of the\nfollowing problems: (i) They only consider fact triplets, while ignoring the\nontology information of knowledge graphs. (ii) The obtained embeddings do not\ncontain much semantic information. Therefore, using these embeddings for\nsemantic tasks is problematic. (iii) They do not enable large-scale training.\nIn this paper, we propose a new algorithm that incorporates the ontology of\nknowledge graphs and partitions the knowledge graph based on classes to include\nmore semantic information for parallel training of large-scale knowledge graph\nembeddings. Our preliminary results show that our algorithm performs well on\nseveral popular benchmarks.\n","authors":["Yuhe Bai"],"pdf_url":"https://arxiv.org/pdf/2501.04613v1.pdf","comment":"Accepted at WWW '23 Companion: Companion Proceedings of the ACM Web\n Conference 2023"},{"id":"http://arxiv.org/abs/2501.04610v1","updated":"2025-01-08T16:47:45Z","published":"2025-01-08T16:47:45Z","title":"Resilient Peer-to-peer Learning based on Adaptive Aggregation","summary":" Collaborative learning in peer-to-peer networks offers the benefits of\ndistributed learning while mitigating the risks associated with single points\nof failure inherent in centralized servers. However, adversarial workers pose\npotential threats by attempting to inject malicious information into the\nnetwork. Thus, ensuring the resilience of peer-to-peer learning emerges as a\npivotal research objective. The challenge is exacerbated in the presence of\nnon-convex loss functions and non-iid data distributions. This paper introduces\na resilient aggregation technique tailored for such scenarios, aimed at\nfostering similarity among peers' learning processes. 
The aggregation weights\nare determined through an optimization procedure, and use the loss function\ncomputed using the neighbor's models and individual private data, thereby\naddressing concerns regarding data privacy in distributed machine learning.\nTheoretical analysis demonstrates convergence of parameters with non-convex\nloss functions and non-iid data distributions. Empirical evaluations across\nthree distinct machine learning tasks support the claims. The empirical\nfindings, which encompass a range of diverse attack models, also demonstrate\nimproved accuracy when compared to existing methodologies.\n","authors":["Chandreyee Bhowmick","Xenofon Koutsoukos"],"pdf_url":"https://arxiv.org/pdf/2501.04610v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2501.04608v1","updated":"2025-01-08T16:44:06Z","published":"2025-01-08T16:44:06Z","title":"Comprehensive Examination of Unrolled Networks for Linear Inverse\n Problems","summary":" Unrolled networks have become prevalent in various computer vision and\nimaging tasks. Although they have demonstrated remarkable efficacy in solving\nspecific computer vision and computational imaging tasks, their adaptation to\nother applications presents considerable challenges. This is primarily due to\nthe multitude of design decisions that practitioners working on new\napplications must navigate, each potentially affecting the network's overall\nperformance. These decisions include selecting the optimization algorithm,\ndefining the loss function, and determining the number of convolutional layers,\namong others. Compounding the issue, evaluating each design choice requires\ntime-consuming simulations to train, fine-tune the neural network, and optimize\nfor its performance. As a result, the process of exploring multiple options and\nidentifying the optimal configuration becomes time-consuming and\ncomputationally demanding. 
The main objectives of this paper are (1) to unify\nsome ideas and methodologies used in unrolled networks to reduce the number of\ndesign choices a user has to make, and (2) to report a comprehensive ablation\nstudy to discuss the impact of each of the choices involved in designing\nunrolled networks and present practical recommendations based on our findings.\nWe anticipate that this study will help scientists and engineers design\nunrolled networks for their applications and diagnose problems within their\nnetworks efficiently.\n","authors":["Eric Chen","Xi Chen","Arian Maleki","Shirin Jalali"],"pdf_url":"https://arxiv.org/pdf/2501.04608v1.pdf","comment":"27 pages, 10 figures. Project Page:\n https://github.com/YuxiChen25/Memory-Net-Inverse"},{"id":"http://arxiv.org/abs/2410.05898v5","updated":"2025-01-08T16:43:41Z","published":"2024-10-08T10:55:40Z","title":"Manifolds, Random Matrices and Spectral Gaps: The geometric phases of\n generative diffusion","summary":" In this paper, we investigate the latent geometry of generative diffusion\nmodels under the manifold hypothesis. For this purpose, we analyze the spectrum\nof eigenvalues (and singular values) of the Jacobian of the score function,\nwhose discontinuities (gaps) reveal the presence and dimensionality of distinct\nsub-manifolds. Using a statistical physics approach, we derive the spectral\ndistributions and formulas for the spectral gaps under several distributional\nassumptions, and we compare these theoretical predictions with the spectra\nestimated from trained networks. Our analysis reveals the existence of three\ndistinct qualitative phases during the generative process: a trivial phase; a\nmanifold coverage phase where the diffusion process fits the distribution\ninternal to the manifold; a consolidation phase where the score becomes\northogonal to the manifold and all particles are projected on the support of\nthe data. 
This `division of labor' between different timescales provides an\nelegant explanation of why generative diffusion models are not affected by the\nmanifold overfitting phenomenon that plagues likelihood-based models, since the\ninternal distribution and the manifold geometry are produced at different time\npoints during generation.\n","authors":["Enrico Ventura","Beatrice Achilli","Gianluigi Silvestri","Carlo Lucibello","Luca Ambrogioni"],"pdf_url":"https://arxiv.org/pdf/2410.05898v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03304v2","updated":"2025-01-08T16:41:03Z","published":"2025-01-06T16:04:56Z","title":"LiLMaps: Learnable Implicit Language Maps","summary":" One of the current trends in robotics is to employ large language models\n(LLMs) to provide non-predefined command execution and natural human-robot\ninteraction. It is useful to have an environment map together with its language\nrepresentation, which can be further utilized by LLMs. Such a comprehensive\nscene representation enables numerous ways of interaction with the map for\nautonomously operating robots. In this work, we present an approach that\nenhances incremental implicit mapping through the integration of\nvision-language features. Specifically, we (i) propose a decoder optimization\ntechnique for implicit language maps which can be used when new objects appear\non the scene, and (ii) address the problem of inconsistent vision-language\npredictions between different viewing positions. 
Our experiments demonstrate\nthe effectiveness of LiLMaps and solid improvements in performance.\n","authors":["Evgenii Kruzhkov","Sven Behnke"],"pdf_url":"https://arxiv.org/pdf/2501.03304v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.15856v2","updated":"2025-01-08T16:31:06Z","published":"2024-01-29T03:07:04Z","title":"The Indoor-Training Effect: unexpected gains from distribution shifts in\n the transition function","summary":" Is it better to perform tennis training in a pristine indoor environment or a\nnoisy outdoor one? To model this problem, here we investigate whether shifts in\nthe transition probabilities between the training and testing environments in\nreinforcement learning problems can lead to better performance under certain\nconditions. We generate new Markov Decision Processes (MDPs) starting from a\ngiven MDP, by adding quantifiable, parametric noise into the transition\nfunction. We refer to this process as Noise Injection and the resulting\nenvironments as {\\delta}-environments. This process allows us to create\nvariations of the same environment with quantitative control over noise serving\nas a metric of distance between environments. Conventional wisdom suggests that\ntraining and testing on the same MDP should yield the best results. In stark\ncontrast, we observe that agents can perform better when trained on the\nnoise-free environment and tested on the noisy {\\delta}-environments, compared\nto training and testing on the same {\\delta}-environments. We confirm that this\nfinding extends beyond noise variations: it is possible to showcase the same\nphenomenon in ATARI game variations including varying Ghost behaviour in\nPacMan, and Paddle behaviour in Pong. We demonstrate this intriguing behaviour\nacross 60 different variations of ATARI games, including PacMan, Pong, and\nBreakout. We refer to this phenomenon as the Indoor-Training Effect. 
Code to\nreproduce our experiments and to implement Noise Injection can be found at\nhttps://bit.ly/3X6CTYk.\n","authors":["Serena Bono","Spandan Madan","Ishaan Grover","Mao Yasueda","Cynthia Breazeal","Hanspeter Pfister","Gabriel Kreiman"],"pdf_url":"https://arxiv.org/pdf/2401.15856v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04750v1","updated":"2025-01-08T16:17:05Z","published":"2025-01-08T16:17:05Z","title":"Efficient License Plate Recognition in Videos Using Visual Rhythm and\n Accumulative Line Analysis","summary":" Video-based Automatic License Plate Recognition (ALPR) involves extracting\nvehicle license plate text information from video captures. Traditional systems\ntypically rely heavily on high-end computing resources and utilize multiple\nframes to recognize license plates, leading to increased computational\noverhead. In this paper, we propose two methods capable of efficiently\nextracting exactly one frame per vehicle and recognizing its license plate\ncharacters from this single image, thus significantly reducing computational\ndemands. The first method uses Visual Rhythm (VR) to generate time-spatial\nimages from videos, while the second employs Accumulative Line Analysis (ALA),\na novel algorithm based on single-line video processing for real-time\noperation. Both methods leverage YOLO for license plate detection within the\nframe and a Convolutional Neural Network (CNN) for Optical Character\nRecognition (OCR) to extract textual information. Experiments on real videos\ndemonstrate that the proposed methods achieve results comparable to traditional\nframe-by-frame approaches, with processing speeds three times faster.\n","authors":["Victor Nascimento Ribeiro","Nina S. T. 
Hirata"],"pdf_url":"https://arxiv.org/pdf/2501.04750v1.pdf","comment":"Accepted for presentation at the Conference on Graphics, Patterns and\n Images (SIBGRAPI) 2024"},{"id":"http://arxiv.org/abs/2501.04588v1","updated":"2025-01-08T16:06:39Z","published":"2025-01-08T16:06:39Z","title":"Federated-Continual Dynamic Segmentation of Histopathology guided by\n Barlow Continuity","summary":" Federated- and Continual Learning have been established as approaches to\nenable privacy-aware learning on continuously changing data, as required for\ndeploying AI systems in histopathology images. However, data shifts can occur\nin a dynamic world, spatially between institutions and temporally, due to\nchanging data over time. This leads to two issues: Client Drift, where the\ncentral model degrades from aggregating data from clients trained on shifted\ndata, and Catastrophic Forgetting, from temporal shifts such as changes in\npatient populations. Both tend to degrade the model's performance of previously\nseen data or spatially distributed training. Despite both problems arising from\nthe same underlying problem of data shifts, existing research addresses them\nonly individually. In this work, we introduce a method that can jointly\nalleviate Client Drift and Catastrophic Forgetting by using our proposed\nDynamic Barlow Continuity that evaluates client updates on a public reference\ndataset and uses this to guide the training process to a spatially and\ntemporally shift-invariant model. We evaluate our approach on the\nhistopathology datasets BCSS and Semicol and prove our method to be highly\neffective by jointly improving the dice score as much as from 15.8% to 71.6% in\nClient Drift and from 42.5% to 62.8% in Catastrophic Forgetting. 
This enables\nDynamic Learning by establishing spatio-temporal shift-invariance.\n","authors":["Niklas Babendererde","Haozhe Zhu","Moritz Fuchs","Jonathan Stieber","Anirban Mukhopadhyay"],"pdf_url":"https://arxiv.org/pdf/2501.04588v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.03692v3","updated":"2025-01-08T16:05:00Z","published":"2024-01-08T06:46:39Z","title":"Boosting Column Generation with Graph Neural Networks for Joint Rider\n Trip Planning and Crew Shift Scheduling","summary":" Optimizing service schedules is pivotal to the reliable, efficient, and\ninclusive on-demand mobility. This pressing challenge is further exacerbated by\nthe increasing needs of an aging population, the oversubscription of existing\nservices, and the lack of effective solution methods. This study addresses the\nintricacies of service scheduling, by jointly optimizing rider trip planning\nand crew scheduling for a complex dynamic mobility service. The resulting\noptimization problems are extremely challenging computationally for\nstate-of-the-art methods. To address this fundamental gap, this paper\nintroduces the Joint Rider Trip Planning and Crew Shift Scheduling Problem\n(JRTPCSSP) and a novel solution method, called Attention and Gated GNN-Informed\nColumn Generation (AGGNNI-CG), that hybridizes column generation and machine\nlearning to obtain near-optimal solutions to the JRTPCSSP with real-life\nconstraints of the application. The key idea of the machine-learning component\nis to dramatically reduce the number of paths to explore in the pricing\nproblem, accelerating the most time-consuming component of the column\ngeneration. The machine learning component is a graph neural network with an\nattention mechanism and a gated architecture, which is particularly suited to\ncater for the different input sizes coming from daily operations. AGGNNI-CG has\nbeen applied to a challenging, real-world dataset from the Paratransit system\nof Chatham County in Georgia. 
It produces substantial improvements compared to\nthe baseline column generation approach, which typically cannot produce\nhigh-quality feasible solutions in reasonable time on large-scale complex\ninstances. AGGNNI-CG also produces significant improvements in service quality\ncompared to the existing system.\n","authors":["Jiawei Lu","Tinghan Ye","Wenbo Chen","Pascal Van Hentenryck"],"pdf_url":"https://arxiv.org/pdf/2401.03692v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.12553v3","updated":"2025-01-08T16:02:24Z","published":"2023-01-29T22:00:53Z","title":"Asymptotic Inference for Multi-Stage Stationary Treatment Policy with\n Variable Selection","summary":" Dynamic treatment regimes or policies are a sequence of decision functions\nover multiple stages that are tailored to individual features. One important\nclass of treatment policies in practice, namely multi-stage stationary\ntreatment policies, prescribes treatment assignment probabilities using the\nsame decision function across stages, where the decision is based on the same\nset of features consisting of time-evolving variables (e.g., routinely\ncollected disease biomarkers). Although there has been extensive literature on\nconstructing valid inference for the value function associated with dynamic\ntreatment policies, little work has focused on the policies themselves,\nespecially in the presence of high-dimensional feature variables. We aim to\nfill the gap in this work. Specifically, we first estimate the multi-stage\nstationary treatment policy using an augmented inverse probability weighted\nestimator for the value function to increase asymptotic efficiency, and further\napply a penalty to select important feature variables. 
We then construct\none-step improvements of the policy parameter estimators for valid inference.\nTheoretically, we show that the improved estimators are asymptotically normal,\neven if nuisance parameters are estimated at a slow convergence rate and the\ndimension of the feature variables increases with the sample size. Our\nnumerical studies demonstrate that the proposed method estimates a sparse\npolicy with a near-optimal value function and conducts valid inference for the\npolicy parameters.\n","authors":["Daiqi Gao","Yufeng Liu","Donglin Zeng"],"pdf_url":"https://arxiv.org/pdf/2301.12553v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03432v2","updated":"2025-01-08T15:57:01Z","published":"2025-01-06T23:28:19Z","title":"Mixture-of-Experts Graph Transformers for Interpretable Particle\n Collision Detection","summary":" The Large Hadron Collider at CERN produces immense volumes of complex data\nfrom high-energy particle collisions, demanding sophisticated analytical\ntechniques for effective interpretation. Neural Networks, including Graph\nNeural Networks, have shown promise in tasks such as event classification and\nobject identification by representing collisions as graphs. However, while\nGraph Neural Networks excel in predictive accuracy, their \"black box\" nature\noften limits their interpretability, making it difficult to trust their\ndecision-making processes. In this paper, we propose a novel approach that\ncombines a Graph Transformer model with Mixture-of-Expert layers to achieve\nhigh predictive performance while embedding interpretability into the\narchitecture. By leveraging attention maps and expert specialization, the model\noffers insights into its internal decision-making, linking predictions to\nphysics-informed features. We evaluate the model on simulated events from the\nATLAS experiment, focusing on distinguishing rare Supersymmetric signal events\nfrom Standard Model background. 
Our results highlight that the model achieves\ncompetitive classification accuracy while providing interpretable outputs that\nalign with known physics, demonstrating its potential as a robust and\ntransparent tool for high-energy physics data analysis. This approach\nunderscores the importance of explainability in machine learning methods\napplied to high energy physics, offering a path toward greater trust in\nAI-driven discoveries.\n","authors":["Donatella Genovese","Alessandro Sgroi","Alessio Devoto","Samuel Valentine","Lennox Wood","Cristiano Sebastiani","Stefano Giagu","Monica D'Onofrio","Simone Scardapane"],"pdf_url":"https://arxiv.org/pdf/2501.03432v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04577v1","updated":"2025-01-08T15:47:04Z","published":"2025-01-08T15:47:04Z","title":"A 65 nm Bayesian Neural Network Accelerator with 360 fJ/Sample In-Word\n GRNG for AI Uncertainty Estimation","summary":" Uncertainty estimation is an indispensable capability for AI-enabled,\nsafety-critical applications, e.g. autonomous vehicles or medical diagnosis.\nBayesian neural networks (BNNs) use Bayesian statistics to provide both\nclassification predictions and uncertainty estimation, but they suffer from\nhigh computational overhead associated with random number generation and\nrepeated sample iterations. Furthermore, BNNs are not immediately amenable to\nacceleration through compute-in-memory architectures due to the frequent memory\nwrites necessary after each RNG operation. To address these challenges, we\npresent an ASIC that integrates 360 fJ/Sample Gaussian RNG directly into the\nSRAM memory words. This integration reduces RNG overhead and enables\nfully-parallel compute-in-memory operations for BNNs. The prototype chip\nachieves 5.12 GSa/s RNG throughput and 102 GOp/s neural network throughput\nwhile occupying 0.45 mm2, bringing AI uncertainty estimation to edge\ncomputation.\n","authors":["Zephan M. 
Enciso","Boyang Cheng","Likai Pei","Jianbo Liu","Steven Davis","Ningyuan Cao","Michael Niemier"],"pdf_url":"https://arxiv.org/pdf/2501.04577v1.pdf","comment":"7 pages, 12 figures"},{"id":"http://arxiv.org/abs/2409.10589v2","updated":"2025-01-08T15:41:04Z","published":"2024-09-16T15:18:10Z","title":"Offline Reinforcement Learning for Learning to Dispatch for Job Shop\n Scheduling","summary":" The Job Shop Scheduling Problem (JSSP) is a complex combinatorial\noptimization problem. While online Reinforcement Learning (RL) has shown\npromise by quickly finding acceptable solutions for JSSP, it faces key\nlimitations: it requires extensive training interactions from scratch leading\nto sample inefficiency, cannot leverage existing high-quality solutions, and\noften yields suboptimal results compared to traditional methods like Constraint\nProgramming (CP). We introduce Offline Reinforcement Learning for Learning to\nDispatch (Offline-LD), which addresses these limitations by learning from\npreviously generated solutions. Our approach is motivated by scenarios where\nhistorical scheduling data and expert solutions are available, although our\ncurrent evaluation focuses on benchmark problems. Offline-LD adapts two\nCQL-based Q-learning methods (mQRDQN and discrete mSAC) for maskable action\nspaces, introduces a novel entropy bonus modification for discrete SAC, and\nexploits reward normalization through preprocessing. Our experiments\ndemonstrate that Offline-LD outperforms online RL on both generated and\nbenchmark instances. 
Notably, by introducing noise into the expert dataset, we\nachieve similar or better results than those obtained from the expert dataset,\nsuggesting that a more diverse training set is preferable because it contains\ncounterfactual information.\n","authors":["Jesse van Remmerden","Zaharah Bukhsh","Yingqian Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.10589v2.pdf","comment":"Code available at https://github.com/jesserem/Offline-LD"},{"id":"http://arxiv.org/abs/2402.07099v3","updated":"2025-01-08T15:37:04Z","published":"2024-02-11T04:09:50Z","title":"Rethinking the Capacity of Graph Neural Networks for Branching Strategy","summary":" Graph neural networks (GNNs) have been widely used to predict properties and\nheuristics of mixed-integer linear programs (MILPs) and hence accelerate MILP\nsolvers. This paper investigates the capacity of GNNs to represent strong\nbranching (SB), the most effective yet computationally expensive heuristic\nemployed in the branch-and-bound algorithm. In the literature, message-passing\nGNN (MP-GNN), as the simplest GNN structure, is frequently used as a fast\napproximation of SB, and we find that not all MILPs' SB can be represented with\nMP-GNN. We precisely define a class of \"MP-tractable\" MILPs for which MP-GNNs\ncan accurately approximate SB scores. Particularly, we establish a universal\napproximation theorem: for any data distribution over the MP-tractable class,\nthere always exists an MP-GNN that can approximate the SB score with\narbitrarily high accuracy and arbitrarily high probability, which lays a\ntheoretical foundation for the existing works on imitating SB with MP-GNN. For\nMILPs without MP-tractability, unfortunately, a similar result is\nimpossible, which can be illustrated by two MILP instances with different SB\nscores that cannot be distinguished by any MP-GNN, regardless of the number of\nparameters. 
Recognizing this, we explore another GNN structure called the\nsecond-order folklore GNN (2-FGNN) that overcomes this limitation, and the\naforementioned universal approximation theorem can be extended to the entire\nMILP space using 2-FGNN, regardless of MP-tractability. A small-scale\nnumerical experiment is conducted to directly validate our theoretical\nfindings.\n","authors":["Ziang Chen","Jialin Liu","Xiaohan Chen","Xinshang Wang","Wotao Yin"],"pdf_url":"https://arxiv.org/pdf/2402.07099v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04570v1","updated":"2025-01-08T15:36:19Z","published":"2025-01-08T15:36:19Z","title":"Large-Scale Spectral Graph Neural Networks via Laplacian Sparsification:\n Technical Report","summary":" Graph Neural Networks (GNNs) play a pivotal role in graph-based tasks for\ntheir proficiency in representation learning. Among the various GNN methods,\nspectral GNNs employing polynomial filters have shown promising performance on\ntasks involving both homophilous and heterophilous graph structures. However,\nthe scalability of spectral GNNs on large graphs is limited because they learn\nthe polynomial coefficients through multiple propagation executions\nduring forward propagation. Existing works have attempted to scale up spectral\nGNNs by eliminating the linear layers on the input node features, a change that\ncan disrupt end-to-end training, potentially impact performance, and become\nimpractical with high-dimensional input features. To address the above\nchallenges, we propose \"Spectral Graph Neural Networks with Laplacian\nSparsification (SGNN-LS)\", a novel graph spectral sparsification method to\napproximate the propagation patterns of spectral GNNs. We prove that our\nproposed method generates Laplacian sparsifiers that can approximate both fixed\nand learnable polynomial filters with theoretical guarantees. 
Our method allows\nthe application of linear layers on the input node features, enabling\nend-to-end training as well as the handling of raw text features. We conduct an\nextensive experimental analysis on datasets spanning various graph scales and\nproperties to demonstrate the superior efficiency and effectiveness of our\nmethod. The results show that our method yields superior results in comparison\nwith the corresponding approximated base models, especially on dataset\nOgbn-papers100M(111M nodes, 1.6B edges) and MAG-scholar-C (2.8M features).\n","authors":["Haipeng Ding","Zhewei Wei","Yuhang Ye"],"pdf_url":"https://arxiv.org/pdf/2501.04570v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.12046v2","updated":"2025-01-08T15:35:02Z","published":"2024-10-15T20:32:07Z","title":"Towards Realistic Evaluation of Commit Message Generation by Matching\n Online and Offline Settings","summary":" When a Commit Message Generation (CMG) system is integrated into the IDEs and\nother products at JetBrains, we perform online evaluation based on user\nacceptance of the generated messages. However, performing online experiments\nwith every change to a CMG system is troublesome, as each iteration affects\nusers and requires time to collect enough statistics. On the other hand,\noffline evaluation, a prevalent approach in the research literature,\nfacilitates fast experiments but employs automatic metrics that are not\nguaranteed to represent the preferences of real users. 
In this work, we\ndescribe a novel way we employed to deal with this problem at JetBrains, by\nleveraging an online metric - the number of edits users introduce before\ncommitting the generated messages to the VCS - to select metrics for offline\nexperiments.\n To support this new type of evaluation, we develop a novel markup collection\ntool mimicking the real workflow with a CMG system, collect a dataset with 57\npairs consisting of commit messages generated by GPT-4 and their counterparts\nedited by human experts, and design and verify a way to synthetically extend\nsuch a dataset. Then, we use the final dataset of 656 pairs to study how the\nwidely used similarity metrics correlate with the online metric reflecting the\nreal users' experience.\n Our results indicate that edit distance exhibits the highest correlation with\nthe online metric, whereas commonly used similarity metrics such as BLEU and\nMETEOR demonstrate low correlation. This contradicts the previous studies on\nsimilarity metrics for CMG, suggesting that user interactions with a CMG system\nin real-world settings differ significantly from the responses by human\nlabelers within controlled environments. 
We release all the code and the\ndataset to support future research in the field: https://jb.gg/cmg-evaluation.\n","authors":["Petr Tsvetkov","Aleksandra Eliseeva","Danny Dig","Alexander Bezzubov","Yaroslav Golubev","Timofey Bryksin","Yaroslav Zharov"],"pdf_url":"https://arxiv.org/pdf/2410.12046v2.pdf","comment":"10 pages, 5 figures (Published at ICSE'2025)"},{"id":"http://arxiv.org/abs/2501.04568v1","updated":"2025-01-08T15:32:12Z","published":"2025-01-08T15:32:12Z","title":"Supervision-free Vision-Language Alignment","summary":" Vision-language models (VLMs) have demonstrated remarkable potential in\nintegrating visual and linguistic information, but their performance is often\nconstrained by the need for extensive, high-quality image-text training data.\nCuration of these image-text pairs is both time-consuming and computationally\nexpensive. To address this challenge, we introduce SVP (Supervision-free Visual\nProjection), a novel framework that enhances vision-language alignment without\nrelying on curated data or preference annotation. SVP leverages self-captioning\nand a pre-trained grounding model as a feedback mechanism to elicit latent\ninformation in VLMs. We evaluate our approach across six key areas: captioning,\nreferring, visual question answering, multitasking, hallucination control, and\nobject recall. Results demonstrate significant improvements, including a 14%\naverage improvement in captioning tasks, up to 12% increase in object recall,\nand substantial reduction in hallucination rates. 
Notably, a small VLM using\nSVP achieves hallucination reductions comparable to a model five times larger,\nwhile a VLM with initially poor referring capabilities more than doubles its\nperformance, approaching parity with a model twice its size.\n","authors":["Giorgio Giannone","Ruoteng Li","Qianli Feng","Evgeny Perevodchikov","Rui Chen","Aleix Martinez"],"pdf_url":"https://arxiv.org/pdf/2501.04568v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2406.06184v2","updated":"2025-01-08T15:28:11Z","published":"2024-06-10T11:28:25Z","title":"Deep Multi-Objective Reinforcement Learning for Utility-Based\n Infrastructural Maintenance Optimization","summary":" In this paper, we introduce Multi-Objective Deep Centralized Multi-Agent\nActor-Critic (MO-DCMAC), a multi-objective reinforcement learning (MORL)\nmethod for infrastructural maintenance optimization, an area traditionally\ndominated by single-objective reinforcement learning (RL) approaches. Previous\nsingle-objective RL methods combine multiple objectives, such as probability of\ncollapse and cost, into a singular reward signal through reward-shaping. In\ncontrast, MO-DCMAC can optimize a policy for multiple objectives directly, even\nwhen the utility function is non-linear. We evaluated MO-DCMAC using two\nutility functions, which use probability of collapse and cost as input. The\nfirst utility function is the Threshold utility, in which MO-DCMAC should\nminimize cost so that the probability of collapse is never above the threshold.\nThe second is based on the Failure Mode, Effects, and Criticality Analysis\n(FMECA) methodology used by asset managers to assess maintenance plans. We\nevaluated MO-DCMAC, with both utility functions, in multiple maintenance\nenvironments, including ones based on a case study of the historical quay walls\nof Amsterdam. The performance of MO-DCMAC was compared against multiple\nrule-based policies based on heuristics currently used for constructing\nmaintenance plans. 
Our results demonstrate that MO-DCMAC outperforms\ntraditional rule-based policies across various environments and utility\nfunctions.\n","authors":["Jesse van Remmerden","Maurice Kenter","Diederik M. Roijers","Charalampos Andriotis","Yingqian Zhang","Zaharah Bukhsh"],"pdf_url":"https://arxiv.org/pdf/2406.06184v2.pdf","comment":"Accepted in the Neural Computing and Applications: Topical Collection\n on Multi-Objective Decision Making 2023 (MODeM 2023)"},{"id":"http://arxiv.org/abs/2412.04628v2","updated":"2025-01-08T15:00:39Z","published":"2024-12-05T21:50:22Z","title":"SWEPO: Simultaneous Weighted Preference Optimization for Group\n Contrastive Alignment","summary":" We introduce Simultaneous Weighted Preference Optimization (SWEPO), a novel\nextension of Direct Preference Optimization (DPO) designed to accommodate\nmultiple dynamically chosen positive and negative responses for each query.\nSWEPO employs a weighted group contrastive loss, assigning weights to responses\nbased on their deviation from the mean reward score. This approach effectively\nprioritizes responses that are significantly better or worse than the average,\nenhancing optimization. Our theoretical analysis demonstrates that\nsimultaneously considering multiple preferences reduces alignment bias,\nresulting in more robust alignment. 
Additionally, we provide insights into the\ntraining dynamics of our loss function and a related function, InfoNCA.\nEmpirical validation on the UltraFeedback dataset establishes SWEPO as\nstate-of-the-art, with superior performance in downstream evaluations using the\nAlpacaEval dataset.\n","authors":["Taneesh Gupta","Rahul Madhavan","Xuchao Zhang","Chetan Bansal","Saravan Rajmohan"],"pdf_url":"https://arxiv.org/pdf/2412.04628v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04547v1","updated":"2025-01-08T14:51:36Z","published":"2025-01-08T14:51:36Z","title":"Medical artificial intelligence toolbox (MAIT): an explainable machine\n learning framework for binary classification, survival modelling, and\n regression analyses","summary":" While machine learning offers diverse techniques suitable for exploring\nvarious medical research questions, a cohesive synergistic framework can\nfacilitate the integration and understanding of new approaches within unified\nmodel development and interpretation. We therefore introduce the Medical\nArtificial Intelligence Toolbox (MAIT), an explainable, open-source Python\npipeline for developing and evaluating binary classification, regression, and\nsurvival models on tabular datasets. MAIT addresses key challenges (e.g., high\ndimensionality, class imbalance, mixed variable types, and missingness) while\npromoting transparency in reporting (TRIPOD+AI compliant). Offering automated\nconfigurations for beginners and customizable source code for experts, MAIT\nstreamlines two primary use cases: Discovery (feature importance via unified\nscoring, e.g., SHapley Additive exPlanations - SHAP) and Prediction (model\ndevelopment and deployment with optimized solutions). 
Moreover, MAIT proposes\nnew techniques including fine-tuning of probability threshold in binary\nclassification, translation of cumulative hazard curves to binary\nclassification, enhanced visualizations for model interpretation for mixed data\ntypes, and handling censoring through semi-supervised learning, to adapt to a\nwide set of data constraints and study designs. We provide detailed tutorials\non GitHub, using four open-access data sets, to demonstrate how MAIT can be\nused to improve implementation and interpretation of ML models in medical\nresearch.\n","authors":["Ramtin Zargari Marandi","Anne Svane Frahm","Jens Lundgren","Daniel Dawson Murray","Maja Milojevic"],"pdf_url":"https://arxiv.org/pdf/2501.04547v1.pdf","comment":"14 pages, 2 figures, 1 table"},{"id":"http://arxiv.org/abs/2501.00889v2","updated":"2025-01-08T14:50:23Z","published":"2025-01-01T16:36:21Z","title":"Evaluating Time Series Foundation Models on Noisy Periodic Time Series","summary":" While recent advancements in foundation models have significantly impacted\nmachine learning, rigorous tests on the performance of time series foundation\nmodels (TSFMs) remain largely underexplored. This paper presents an empirical\nstudy evaluating the zero-shot, long-horizon forecasting abilities of several\nleading TSFMs over two synthetic datasets constituting noisy periodic time\nseries. We assess model efficacy across different noise levels, underlying\nfrequencies, and sampling rates. As benchmarks for comparison, we choose two\nstatistical techniques: a Fourier transform (FFT)-based approach and a linear\nautoregressive (AR) model. 
Our findings demonstrate that while for time series\nwith bounded periods and higher sampling rates, TSFMs can match or outperform\nthe statistical approaches, their forecasting abilities deteriorate with longer\nperiods, higher noise levels, lower sampling rates and more complex shapes of\nthe time series.\n","authors":["Syamantak Datta Gupta"],"pdf_url":"https://arxiv.org/pdf/2501.00889v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02334v2","updated":"2025-01-08T14:42:05Z","published":"2024-04-26T15:02:39Z","title":"Rad4XCNN: a new agnostic method for post-hoc global explanation of\n CNN-derived features by means of radiomics","summary":" In recent years, machine learning-based clinical decision support systems\n(CDSS) have played a key role in the analysis of several medical conditions.\nDespite their promising capabilities, the lack of transparency in AI models\nposes significant challenges, particularly in medical contexts where\nreliability is a mandatory aspect. However, it appears that explainability is\ninversely proportional to accuracy. For this reason, achieving transparency\nwithout compromising predictive accuracy remains a key challenge. This paper\npresents a novel method, namely Rad4XCNN, to enhance the predictive power of\nCNN-derived features with the inherent interpretability of radiomic features.\nRad4XCNN diverges from conventional methods based on saliency maps, by\nassociating intelligible meaning to CNN-derived features by means of Radiomics,\noffering new perspectives on explanation methods beyond visualization maps.\nUsing a breast cancer classification task as a case study, we evaluated\nRad4XCNN on ultrasound imaging datasets, including an online dataset and two\nin-house datasets for internal and external validation. 
Some key results are:\ni) CNN-derived features guarantee more robust accuracy when compared against\nViT-derived and radiomic features; ii) conventional visualization map methods\nfor explanation present several pitfalls; iii) Rad4XCNN does not sacrifice\nmodel accuracy for their explainability; iv) Rad4XCNN provides a global\nexplanation enabling the physician to extract global insights and findings. Our\nmethod can mitigate some concerns related to the explainability-accuracy\ntrade-off. This study highlighted the importance of proposing new methods for\nmodel explanation without affecting their accuracy.\n","authors":["Francesco Prinzi","Carmelo Militello","Calogero Zarcaro","Tommaso Vincenzo Bartolotta","Salvatore Gaglio","Salvatore Vitabile"],"pdf_url":"https://arxiv.org/pdf/2405.02334v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.00599v2","updated":"2025-01-08T14:38:30Z","published":"2024-12-31T18:56:46Z","title":"VideoRefer Suite: Advancing Spatial-Temporal Object Understanding with\n Video LLM","summary":" Video Large Language Models (Video LLMs) have recently exhibited remarkable\ncapabilities in general video understanding. However, they mainly focus on\nholistic comprehension and struggle with capturing fine-grained spatial and\ntemporal details. Besides, the lack of high-quality object-level video\ninstruction data and a comprehensive benchmark further hinders their\nadvancements. To tackle these challenges, we introduce the VideoRefer Suite to\nempower Video LLM for finer-level spatial-temporal video understanding, i.e.,\nenabling perception and reasoning on any objects throughout the video.\nSpecially, we thoroughly develop VideoRefer Suite across three essential\naspects: dataset, model, and benchmark. Firstly, we introduce a multi-agent\ndata engine to meticulously curate a large-scale, high-quality object-level\nvideo instruction dataset, termed VideoRefer-700K. 
Next, we present the\nVideoRefer model, which equips a versatile spatial-temporal object encoder to\ncapture precise regional and sequential representations. Finally, we\nmeticulously create a VideoRefer-Bench to comprehensively assess the\nspatial-temporal understanding capability of a Video LLM, evaluating it across\nvarious aspects. Extensive experiments and analyses demonstrate that our\nVideoRefer model not only achieves promising performance on video referring\nbenchmarks but also facilitates general video understanding capabilities.\n","authors":["Yuqian Yuan","Hang Zhang","Wentong Li","Zesen Cheng","Boqiang Zhang","Long Li","Xin Li","Deli Zhao","Wenqiao Zhang","Yueting Zhuang","Jianke Zhu","Lidong Bing"],"pdf_url":"https://arxiv.org/pdf/2501.00599v2.pdf","comment":"17 pages, 14 figures, technical report"},{"id":"http://arxiv.org/abs/2501.04538v1","updated":"2025-01-08T14:38:03Z","published":"2025-01-08T14:38:03Z","title":"HypeRL: Parameter-Informed Reinforcement Learning for Parametric PDEs","summary":" In this work, we devise a new, general-purpose reinforcement learning\nstrategy for the optimal control of parametric partial differential equations\n(PDEs). Such problems frequently arise in applied sciences and engineering and\nentail a significant complexity when control and/or state variables are\ndistributed in high-dimensional space or depend on varying parameters.\nTraditional numerical methods, relying on either iterative minimization\nalgorithms or dynamic programming, while reliable, often become computationally\ninfeasible. Indeed, in either way, the optimal control problem must be solved\nfor each instance of the parameters, and this is out of reach when dealing with\nhigh-dimensional time-dependent and parametric PDEs. In this paper, we propose\nHypeRL, a deep reinforcement learning (DRL) framework to overcome the\nlimitations shown by traditional methods. HypeRL aims at approximating the\noptimal control policy directly. 
Specifically, we employ an actor-critic DRL\napproach to learn an optimal feedback control strategy that can generalize\nacross the range of variation of the parameters. To effectively learn such\noptimal control laws, encoding the parameter information into the DRL policy\nand value function neural networks (NNs) is essential. To do so, HypeRL uses\ntwo additional NNs, often called hypernetworks, to learn the weights and biases\nof the value function and the policy NNs. We validate the proposed approach on\ntwo PDE-constrained optimal control benchmarks, namely a 1D\nKuramoto-Sivashinsky equation and the 2D Navier-Stokes equations, by showing that\nthe knowledge of the PDE parameters and how this information is encoded, i.e.,\nvia a hypernetwork, is an essential ingredient for learning parameter-dependent\ncontrol policies that can generalize effectively to unseen scenarios and for\nimproving the sample efficiency of such policies.\n","authors":["Nicolò Botteghi","Stefania Fresca","Mengwu Guo","Andrea Manzoni"],"pdf_url":"https://arxiv.org/pdf/2501.04538v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04534v1","updated":"2025-01-08T14:33:47Z","published":"2025-01-08T14:33:47Z","title":"Combining YOLO and Visual Rhythm for Vehicle Counting","summary":" Video-based vehicle detection and counting play a critical role in managing\ntransport infrastructure. Traditional image-based counting methods usually\ninvolve two main steps: initial detection and subsequent tracking, which are\napplied to all video frames, leading to a significant increase in computational\ncomplexity. To address this issue, this work presents an alternative and more\nefficient method for vehicle detection and counting. The proposed approach\neliminates the need for a tracking step and focuses solely on detecting\nvehicles in key video frames, thereby increasing its efficiency. 
To achieve\nthis, we developed a system that combines YOLO, for vehicle detection, with\nVisual Rhythm, a way to create time-spatial images that allows us to focus on\nframes that contain useful information. Additionally, this method can be used\nfor counting in any application involving unidirectional moving targets to be\ndetected and identified. Experimental analysis using real videos shows that the\nproposed method achieves mean counting accuracy around 99.15% over a set of\nvideos, with a processing speed three times faster than tracking based\napproaches.\n","authors":["Victor Nascimento Ribeiro","Nina S. T. Hirata"],"pdf_url":"https://arxiv.org/pdf/2501.04534v1.pdf","comment":"Accepted for presentation at the Conference on Graphics, Patterns and\n Images (SIBGRAPI) 2023"},{"id":"http://arxiv.org/abs/2501.02156v3","updated":"2025-01-08T14:26:51Z","published":"2025-01-04T01:45:32Z","title":"The Race to Efficiency: A New Perspective on AI Scaling Laws","summary":" As large-scale AI models expand, training becomes costlier and sustaining\nprogress grows harder. Classical scaling laws (e.g., Kaplan et al. (2020),\nHoffmann et al. (2022)) predict training loss from a static compute budget yet\nneglect time and efficiency, prompting the question: how can we balance\nballooning GPU fleets with rapidly improving hardware and algorithms? We\nintroduce the relative-loss equation, a time- and efficiency-aware framework\nthat extends classical AI scaling laws. Our model shows that, without ongoing\nefficiency gains, advanced performance could demand millennia of training or\nunrealistically large GPU fleets. However, near-exponential progress remains\nachievable if the \"efficiency-doubling rate\" parallels Moore's Law. By\nformalizing this race to efficiency, we offer a quantitative roadmap for\nbalancing front-loaded GPU investments with incremental improvements across the\nAI stack. 
Empirical trends suggest that sustained efficiency gains can push AI\nscaling well into the coming decade, providing a new perspective on the\ndiminishing returns inherent in classical scaling.\n","authors":["Chien-Ping Lu"],"pdf_url":"https://arxiv.org/pdf/2501.02156v3.pdf","comment":"21 pages, 3 figures. 2 tables, second draft"},{"id":"http://arxiv.org/abs/2501.04529v1","updated":"2025-01-08T14:21:03Z","published":"2025-01-08T14:21:03Z","title":"A Plug-and-Play Bregman ADMM Module for Inferring Event Branches in\n Temporal Point Processes","summary":" An event sequence generated by a temporal point process is often associated\nwith a hidden and structured event branching process that captures the\ntriggering relations between its historical and current events. In this study,\nwe design a new plug-and-play module based on the Bregman ADMM (BADMM)\nalgorithm, which infers event branches associated with event sequences in the\nmaximum likelihood estimation framework of temporal point processes (TPPs).\nSpecifically, we formulate the inference of event branches as an optimization\nproblem for the event transition matrix under sparse and low-rank constraints,\nwhich is embedded in existing TPP models or their learning paradigms. We can\nimplement this optimization problem based on subspace clustering and sparse\ngroup-lasso, respectively, and solve it using the Bregman ADMM algorithm, whose\nunrolling leads to the proposed BADMM module. When learning a classic TPP\n(e.g., Hawkes process) by the expectation-maximization algorithm, the BADMM\nmodule helps derive structured responsibility matrices in the E-step.\nSimilarly, the BADMM module helps derive low-rank and sparse attention maps for\nthe neural TPPs with self-attention layers. The structured responsibility\nmatrices and attention maps, which work as learned event transition matrices,\nindicate event branches, e.g., inferring isolated events and those key events\ntriggering many subsequent events. 
Experiments on both synthetic and real-world\ndata show that plugging our BADMM module into existing TPP models and learning\nparadigms can improve model performance and provide us with interpretable\nstructured event branches. The code is available at\n\\url{https://github.com/qingmeiwangdaily/BADMM_TPP}.\n","authors":["Qingmei Wang","Yuxin Wu","Yujie Long","Jing Huang","Fengyuan Ran","Bing Su","Hongteng Xu"],"pdf_url":"https://arxiv.org/pdf/2501.04529v1.pdf","comment":"Accepted at AAAI 2025"},{"id":"http://arxiv.org/abs/2501.04528v1","updated":"2025-01-08T14:19:54Z","published":"2025-01-08T14:19:54Z","title":"Towards a Problem-Oriented Domain Adaptation Framework for Machine\n Learning","summary":" Domain adaptation is a sub-field of machine learning that involves\ntransferring knowledge from a source domain to perform the same task in the\ntarget domain. It is a typical challenge in machine learning that arises, e.g.,\nwhen data is obtained from various sources or when using a data basis that\nchanges over time. Recent advances in the field offer promising methods, but it\nis still challenging for researchers and practitioners to determine if domain\nadaptation is suitable for a given problem -- and, subsequently, to select the\nappropriate approach. This article employs design science research to develop a\nproblem-oriented framework for domain adaptation, which is matured in three\nevaluation episodes. We describe a framework that distinguishes between five\ndomain adaptation scenarios, provides recommendations for addressing each\nscenario, and offers guidelines for determining if a problem falls into one of\nthese scenarios. During the multiple evaluation episodes, the framework is\ntested on artificial and real-world datasets and an experimental study\ninvolving 100 participants. The evaluation demonstrates that the framework has\nthe explanatory power to capture any domain adaptation problem effectively. 
In\nsummary, we provide clear guidance for researchers and practitioners who want\nto employ domain adaptation but lack in-depth knowledge of the possibilities.\n","authors":["Philipp Spitzer","Dominik Martin","Laurin Eichberger","Niklas Kühl"],"pdf_url":"https://arxiv.org/pdf/2501.04528v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04527v1","updated":"2025-01-08T14:19:03Z","published":"2025-01-08T14:19:03Z","title":"Towards Fair Class-wise Robustness: Class Optimal Distribution\n Adversarial Training","summary":" Adversarial training has proven to be a highly effective method for improving\nthe robustness of deep neural networks against adversarial attacks.\nNonetheless, it has been observed to exhibit a limitation in terms of robust\nfairness, characterized by a significant disparity in robustness across\ndifferent classes. Recent efforts to mitigate this problem have turned to\nclass-wise reweighted methods. However, these methods suffer from a lack of\nrigorous theoretical analysis and are limited in their exploration of the\nweight space, as they mainly rely on existing heuristic algorithms or intuition\nto compute weights. In addition, these methods fail to guarantee the\nconsistency of the optimization direction due to the decoupled optimization of\nweights and the model parameters. They potentially lead to suboptimal weight\nassignments and consequently, a suboptimal model. To address these problems,\nthis paper proposes a novel min-max training framework, Class Optimal\nDistribution Adversarial Training (CODAT), which employs distributionally\nrobust optimization to fully explore the class-wise weight space, thus enabling\nthe identification of the optimal weight with theoretical guarantees.\nFurthermore, we derive a closed-form optimal solution to the internal\nmaximization and then get a deterministic equivalent objective function, which\nprovides a theoretical basis for the joint optimization of weights and model\nparameters. 
Meanwhile, we propose a fairness elasticity coefficient for the\nevaluation of the algorithm with regard to both robustness and robust fairness.\nExperimental results on various datasets show that the proposed method can\neffectively improve the robust fairness of the model and outperform the\nstate-of-the-art approaches.\n","authors":["Hongxin Zhi","Hongtao Yu","Shaome Li","Xiuming Zhao","Yiteng Wu"],"pdf_url":"https://arxiv.org/pdf/2501.04527v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18103v3","updated":"2025-01-08T14:13:23Z","published":"2024-03-26T21:01:41Z","title":"Tutorial on Diffusion Models for Imaging and Vision","summary":" The astonishing growth of generative tools in recent years has empowered many\nexciting applications in text-to-image generation and text-to-video generation.\nThe underlying principle behind these generative tools is the concept of\ndiffusion, a particular sampling mechanism that has overcome some shortcomings\nthat were deemed difficult in the previous approaches. The goal of this\ntutorial is to discuss the essential ideas underlying the diffusion models. The\ntarget audience of this tutorial includes undergraduate and graduate students\nwho are interested in doing research on diffusion models or applying these\nmodels to solve other problems.\n","authors":["Stanley H. Chan"],"pdf_url":"https://arxiv.org/pdf/2403.18103v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.01844v2","updated":"2025-01-08T14:10:15Z","published":"2025-01-03T14:54:49Z","title":"Learning from Ambiguous Data with Hard Labels","summary":" Real-world data often contains intrinsic ambiguity that the common\nsingle-hard-label annotation paradigm ignores. Standard training using\nambiguous data with these hard labels may produce overly confident models and\nthus lead to poor generalization. In this paper, we propose a novel\nframework called Quantized Label Learning (QLL) to alleviate this issue. 
First,\nwe formulate QLL as learning from (very) ambiguous data with hard labels:\nideally, each ambiguous instance should be associated with a ground-truth\nsoft-label distribution describing its corresponding probabilistic weight in\neach class, however, this is usually not accessible; in practice, we can only\nobserve a quantized label, i.e., a hard label sampled (quantized) from the\ncorresponding ground-truth soft-label distribution, of each instance, which can\nbe seen as a biased approximation of the ground-truth soft-label. Second, we\npropose a Class-wise Positive-Unlabeled (CPU) risk estimator that allows us to\ntrain accurate classifiers from only ambiguous data with quantized labels.\nThird, to simulate ambiguous datasets with quantized labels in the real world,\nwe design a mixing-based ambiguous data generation procedure for empirical\nevaluation. Experiments demonstrate that our CPU method can significantly\nimprove model generalization performance and outperform the baselines.\n","authors":["Zeke Xie","Zheng He","Nan Lu","Lichen Bai","Bao Li","Shuo Yang","Mingming Sun","Ping Li"],"pdf_url":"https://arxiv.org/pdf/2501.01844v2.pdf","comment":"9 pages, 4 figures, accepted by ICASSP 2025"},{"id":"http://arxiv.org/abs/2405.13867v2","updated":"2025-01-08T14:08:11Z","published":"2024-05-22T17:48:17Z","title":"Scaling-laws for Large Time-series Models","summary":" Scaling laws for large language models (LLMs) have provided useful guidance\nin training ever larger models for predictable performance gains. Time series\nforecasting shares a similar sequential structure to language, and is amenable\nto large-scale transformer architectures. Here we show that foundational\ndecoder-only time series transformer models exhibit analogous scaling-behavior\nto LLMs, with architectural details (aspect ratio and number of heads) having a\nminimal effect over broad ranges. 
We assemble a large corpus of heterogeneous\ntime series data on which to train, and establish for the first time power-law\nscaling with parameter count, dataset size, and training compute, spanning five\norders of magnitude.\n","authors":["Thomas D. P. Edwards","James Alvey","Justin Alsing","Nam H. Nguyen","Benjamin D. Wandelt"],"pdf_url":"https://arxiv.org/pdf/2405.13867v2.pdf","comment":"4 main pages (16 total), 4 figures; Accepted for oral presentation in\n Time Series in the Age of Large Models (TSALM) Workshop at Neurips 2024"},{"id":"http://arxiv.org/abs/2405.08766v2","updated":"2025-01-08T13:45:46Z","published":"2024-05-14T16:59:20Z","title":"Energy-based Hopfield Boosting for Out-of-Distribution Detection","summary":" Out-of-distribution (OOD) detection is critical when deploying machine\nlearning models in the real world. Outlier exposure methods, which incorporate\nauxiliary outlier data in the training process, can drastically improve OOD\ndetection performance compared to approaches without advanced training\nstrategies. We introduce Hopfield Boosting, a boosting approach, which\nleverages modern Hopfield energy (MHE) to sharpen the decision boundary between\nthe in-distribution and OOD data. Hopfield Boosting encourages the model to\nconcentrate on hard-to-distinguish auxiliary outlier examples that lie close to\nthe decision boundary between in-distribution and auxiliary outlier data. 
Our\nmethod achieves a new state-of-the-art in OOD detection with outlier exposure,\nimproving the FPR95 metric from 2.28 to 0.92 on CIFAR-10 and from 11.76 to 7.94\non CIFAR-100.\n","authors":["Claus Hofmann","Simon Schmid","Bernhard Lehner","Daniel Klotz","Sepp Hochreiter"],"pdf_url":"https://arxiv.org/pdf/2405.08766v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2501.02270v2","updated":"2025-01-08T13:42:02Z","published":"2025-01-04T12:15:58Z","title":"Efficient Video-Based ALPR System Using YOLO and Visual Rhythm","summary":" Automatic License Plate Recognition (ALPR) involves extracting vehicle\nlicense plate information from image or a video capture. These systems have\ngained popularity due to the wide availability of low-cost surveillance cameras\nand advances in Deep Learning. Typically, video-based ALPR systems rely on\nmultiple frames to detect the vehicle and recognize the license plates.\nTherefore, we propose a system capable of extracting exactly one frame per\nvehicle and recognizing its license plate characters from this singular image\nusing an Optical Character Recognition (OCR) model. Early experiments show that\nthis methodology is viable.\n","authors":["Victor Nascimento Ribeiro","Nina S. T. Hirata"],"pdf_url":"https://arxiv.org/pdf/2501.02270v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2409.16586v2","updated":"2025-01-08T13:16:26Z","published":"2024-09-25T03:25:34Z","title":"AutoSTF: Decoupled Neural Architecture Search for Cost-Effective\n Automated Spatio-Temporal Forecasting","summary":" Spatio-temporal forecasting is a critical component of various smart city\napplications, such as transportation optimization, energy management, and\nsocio-economic analysis. 
Recently, several automated spatio-temporal\nforecasting methods have been proposed to automatically search the optimal\nneural network architecture for capturing complex spatio-temporal dependencies.\nHowever, the existing automated approaches suffer from expensive neural\narchitecture search overhead, which hinders their practical use and the further\nexploration of diverse spatio-temporal operators in a finer granularity. In\nthis paper, we propose AutoSTF, a decoupled automatic neural architecture\nsearch framework for cost-effective automated spatio-temporal forecasting. From\nthe efficiency perspective, we first decouple the mixed search space into\ntemporal space and spatial space and respectively devise representation\ncompression and parameter-sharing schemes to mitigate the parameter explosion.\nThe decoupled spatio-temporal search not only expedites the model optimization\nprocess but also leaves new room for more effective spatio-temporal dependency\nmodeling. From the effectiveness perspective, we propose a multi-patch transfer\nmodule to jointly capture multi-granularity temporal dependencies and extend\nthe spatial search space to enable finer-grained layer-wise spatial dependency\nsearch. Extensive experiments on eight datasets demonstrate the superiority of\nAutoSTF in terms of both accuracy and efficiency. 
Specifically, our proposed\nmethod achieves up to 13.48x speed-up compared to state-of-the-art automatic\nspatio-temporal forecasting methods while maintaining the best forecasting\naccuracy.\n","authors":["Tengfei Lyu","Weijia Zhang","Jinliang Deng","Hao Liu"],"pdf_url":"https://arxiv.org/pdf/2409.16586v2.pdf","comment":"Accepted by KDD 2025 Research Track"},{"id":"http://arxiv.org/abs/2501.04487v1","updated":"2025-01-08T13:14:05Z","published":"2025-01-08T13:14:05Z","title":"Integrating remote sensing data assimilation, deep learning and large\n language model for interactive wheat breeding yield prediction","summary":" Yield is one of the core goals of crop breeding. By predicting the potential\nyield of different breeding materials, breeders can screen these materials at\nvarious growth stages to select the best performing. Based on unmanned aerial\nvehicle remote sensing technology, high-throughput crop phenotyping data in\nbreeding areas is collected to provide data support for the breeding decisions\nof breeders. However, the accuracy of current yield predictions still requires\nimprovement, and the usability and user-friendliness of yield forecasting tools\nremain suboptimal. To address these challenges, this study introduces a hybrid\nmethod and tool for crop yield prediction, designed to allow breeders to\ninteractively and accurately predict wheat yield by chatting with a large\nlanguage model (LLM). First, the newly designed data assimilation algorithm is\nused to assimilate the leaf area index into the WOFOST model. Then, selected\noutputs from the assimilation process, along with remote sensing inversion\nresults, are used to drive the time-series temporal fusion transformer model\nfor wheat yield prediction. Finally, based on this hybrid method and leveraging\nan LLM with retrieval augmented generation technology, we developed an\ninteractive yield prediction Web tool that is user-friendly and supports\nsustainable data updates. 
This tool integrates multi-source data to assist\nbreeding decision-making. This study aims to accelerate the identification of\nhigh-yield materials in the breeding process, enhance breeding efficiency, and\nenable more scientific and smart breeding decisions.\n","authors":["Guofeng Yang","Nanfei Jin","Wenjie Ai","Zhonghua Zheng","Yuhong He","Yong He"],"pdf_url":"https://arxiv.org/pdf/2501.04487v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04481v1","updated":"2025-01-08T13:04:08Z","published":"2025-01-08T13:04:08Z","title":"Safe Reinforcement Learning with Minimal Supervision","summary":" Reinforcement learning (RL) in the real world necessitates the development of\nprocedures that enable agents to explore without causing harm to themselves or\nothers. The most successful solutions to the problem of safe RL leverage\noffline data to learn a safe-set, enabling safe online exploration. However,\nthis approach to safe-learning is often constrained by the demonstrations that\nare available for learning.\n In this paper we investigate the influence of the quantity and quality of\ndata used to train the initial safe learning problem offline on the ability to\nlearn safe-RL policies online. Specifically, we focus on tasks with spatially\nextended goal states where we have few or no demonstrations available.\nClassically this problem is addressed either by using hand-designed controllers\nto generate data or by collecting user-generated demonstrations. However, these\nmethods are often expensive and do not scale to more complex tasks and\nenvironments. To address this limitation we propose an unsupervised RL-based\noffline data collection procedure, to learn complex and scalable policies\nwithout the need for hand-designed controllers or user demonstrations. 
Our\nresearch demonstrates the significance of providing sufficient demonstrations\nfor agents to learn optimal safe-RL policies online, and as a result, we\npropose optimistic forgetting, a novel online safe-RL approach that is\npractical for scenarios with limited data. Further, our unsupervised data\ncollection approach highlights the need to balance diversity and optimality for\nsafe online exploration.\n","authors":["Alexander Quessy","Thomas Richardson","Sebastian East"],"pdf_url":"https://arxiv.org/pdf/2501.04481v1.pdf","comment":"Initially submitted to ICML 2023"},{"id":"http://arxiv.org/abs/2501.04470v1","updated":"2025-01-08T12:48:15Z","published":"2025-01-08T12:48:15Z","title":"Regularising NARX models with multi-task learning","summary":" A Nonlinear Auto-Regressive with eXogenous inputs (NARX) model can be used to\ndescribe time-varying processes; where the output depends on both previous\noutputs and current/previous external input variables. One limitation of NARX\nmodels is their propensity to overfit and result in poor generalisation for\nfuture predictions. The proposed method to help to overcome the issue of\noverfitting is a NARX model which predicts outputs at both the current time and\nseveral lead times into the future. This is a form of multi-task learner (MTL);\nwhereby the lead time outputs will regularise the current time output. 
This\nwork shows that for high noise level, MTL can be used to regularise NARX with a\nlower Normalised Mean Square Error (NMSE) compared to the NMSE of the\nindependent learner counterpart.\n","authors":["Sarah Bee","Lawrence Bull","Nikolaos Dervilis","Keith Worden"],"pdf_url":"https://arxiv.org/pdf/2501.04470v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08023v2","updated":"2025-01-08T12:40:56Z","published":"2024-09-12T13:05:28Z","title":"Edge-Wise Graph-Instructed Neural Networks","summary":" The problem of multi-task regression over graph nodes has been recently\napproached through Graph-Instructed Neural Network (GINN), which is a promising\narchitecture belonging to the subset of message-passing graph neural networks.\nIn this work, we discuss the limitations of the Graph-Instructed (GI) layer,\nand we formalize a novel edge-wise GI (EWGI) layer. We discuss the advantages\nof the EWGI layer and we provide numerical evidence that EWGINNs perform better\nthan GINNs over some graph-structured input data, like the ones inferred from\nthe Barabasi-Albert graph, and improve the training regularization on graphs\nwith chaotic connectivity, like the ones inferred from the Erdos-Renyi graph.\n","authors":["Francesco Della Santa","Antonio Mastropietro","Sandra Pieraccini","Francesco Vaccarino"],"pdf_url":"https://arxiv.org/pdf/2409.08023v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16149v4","updated":"2025-01-08T12:40:27Z","published":"2024-03-24T13:43:43Z","title":"Analyzing Consumer IoT Traffic from Security and Privacy Perspectives: a\n Comprehensive Survey","summary":" The Consumer Internet of Things (CIoT), a notable segment within the IoT\ndomain, involves the integration of IoT technology into consumer electronics\nand devices, such as smart homes and smart wearables. Compared to traditional\nIoT fields, CIoT differs notably in target users, product types, and design\napproaches. 
While offering convenience to users, it also raises new security\nand privacy concerns. Network traffic analysis, a widely used technique in the\nsecurity community, has been extensively applied to investigate these concerns\nabout CIoT. Compared to network traffic analysis in other fields such as mobile\napps and websites, CIoT presents unique characteristics, introducing new\nchallenges and research opportunities. Researchers have made significant\ncontributions in this area. To aid researchers in understanding the application\nof traffic analysis tools for studying CIoT security and privacy risks, this\nsurvey reviews 303 publications on traffic analysis within the CIoT security\nand privacy domain from January 2018 to June 2024, focusing on three research\nquestions. Our work: 1) outlines the CIoT traffic analysis process and\nhighlights its differences from general network traffic analysis. 2) summarizes\nand classifies existing research into four categories according to its\napplication objectives: device fingerprinting, user activity inference,\nmalicious traffic detection, and measurement. 3) explores emerging challenges\nand potential future research directions based on each step of the CIoT traffic\nanalysis process. This will provide new insights to the community and guide the\nindustry towards safer product designs.\n","authors":["Yan Jia","Yuxin Song","Zihou Liu","Qingyin Tan","Yang Song","Yu Zhang","Zheli Liu"],"pdf_url":"https://arxiv.org/pdf/2403.16149v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.14434v4","updated":"2025-01-08T12:19:46Z","published":"2024-02-22T10:26:46Z","title":"Parallelized Midpoint Randomization for Langevin Monte Carlo","summary":" We study the problem of sampling from a target probability density function\nin frameworks where parallel evaluations of the log-density gradient are\nfeasible. 
Focusing on smooth and strongly log-concave densities, we revisit the\nparallelized randomized midpoint method and investigate its properties using\nrecently developed techniques for analyzing its sequential version. Through\nthese techniques, we derive upper bounds on the Wasserstein distance between\nsampling and target densities. These bounds quantify the substantial runtime\nimprovements achieved through parallel processing.\n","authors":["Lu Yu","Arnak Dalalyan"],"pdf_url":"https://arxiv.org/pdf/2402.14434v4.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2306.08494"},{"id":"http://arxiv.org/abs/2501.04453v1","updated":"2025-01-08T12:14:00Z","published":"2025-01-08T12:14:00Z","title":"Gradient Purification: Defense Against Poisoning Attack in Decentralized\n Federated Learning","summary":" Decentralized federated learning (DFL) is inherently vulnerable to poisoning\nattacks, as malicious clients can transmit manipulated model gradients to\nneighboring clients. Existing defense methods either reject suspicious\ngradients per iteration or restart DFL aggregation after detecting all\nmalicious clients. They overlook the potential accuracy benefit from the\ndiscarded malicious gradients. In this paper, we propose a novel gradient\npurification defense, named GPD, that integrates seamlessly with existing DFL\naggregation to defend against poisoning attacks. It aims to mitigate the harm\nin model gradients while retaining the benefit in model weights for enhancing\naccuracy. For each benign client in GPD, a recording variable is designed to\ntrack the historically aggregated gradients from one of its neighbors. It\nallows benign clients to precisely detect malicious neighbors and swiftly\nmitigate aggregated malicious gradients via historical consistency checks. Upon\nmitigation, GPD optimizes model weights via aggregating gradients solely from\nbenign clients. 
This retains the previously beneficial portions from malicious\nclients and exploits the contributions from benign clients, thereby\nsignificantly enhancing the model accuracy. We analyze the convergence of GPD,\nas well as its ability to harvest high accuracy. Extensive experiments over\nthree datasets demonstrate that GPD is capable of mitigating poisoning attacks\nunder both iid and non-iid data distributions. It significantly outperforms\nstate-of-the-art defenses in terms of accuracy against various poisoning\nattacks.\n","authors":["Bin Li","Xiaoye Miao","Yongheng Shang","Xinkui Zhao","Shuiguang Deng","Jianwei Yin"],"pdf_url":"https://arxiv.org/pdf/2501.04453v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04443v1","updated":"2025-01-08T11:52:43Z","published":"2025-01-08T11:52:43Z","title":"Revisiting LocalSGD and SCAFFOLD: Improved Rates and Missing Analysis","summary":" LocalSGD and SCAFFOLD are widely used methods in distributed stochastic\noptimization, with numerous applications in machine learning, large-scale data\nprocessing, and federated learning. However, rigorously establishing their\ntheoretical advantages over simpler methods, such as minibatch SGD (MbSGD), has\nproven challenging, as existing analyses often rely on strong assumptions,\nunrealistic premises, or overly restrictive scenarios.\n In this work, we revisit the convergence properties of LocalSGD and SCAFFOLD\nunder a variety of existing or weaker conditions, including gradient\nsimilarity, Hessian similarity, weak convexity, and Lipschitz continuity of the\nHessian. Our analysis shows that (i) LocalSGD achieves faster convergence\ncompared to MbSGD for weakly convex functions without requiring stronger\ngradient similarity assumptions; (ii) LocalSGD benefits significantly from\nhigher-order similarity and smoothness; and (iii) SCAFFOLD demonstrates faster\nconvergence than MbSGD for a broader class of non-quadratic functions. 
These\ntheoretical insights provide a clearer understanding of the conditions under\nwhich LocalSGD and SCAFFOLD outperform MbSGD.\n","authors":["Ruichen Luo","Sebastian U Stich","Samuel Horváth","Martin Takáč"],"pdf_url":"https://arxiv.org/pdf/2501.04443v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09570v4","updated":"2025-01-08T11:50:42Z","published":"2024-03-14T17:00:01Z","title":"Multi-Fidelity Bayesian Optimization With Across-Task Transferable\n Max-Value Entropy Search","summary":" In many applications, ranging from logistics to engineering, a designer is\nfaced with a sequence of optimization tasks for which the objectives are in the\nform of black-box functions that are costly to evaluate. Furthermore,\nhigher-fidelity evaluations of the optimization objectives often entail a\nlarger cost. Existing multi-fidelity black-box optimization strategies select\ncandidate solutions and fidelity levels with the goal of maximizing the\ninformation about the optimal value or the optimal solution for the current\ntask. Assuming that successive optimization tasks are related, this paper\nintroduces a novel information-theoretic acquisition function that balances the\nneed to acquire information about the current task with the goal of collecting\ninformation transferable to future tasks. The proposed method transfers across\ntasks distributions over parameters of a Gaussian process surrogate model by\nimplementing particle-based variational Bayesian updates. Theoretical insights\nbased on the analysis of the expected regret substantiate the benefits of\nacquiring transferable knowledge across tasks. 
Furthermore, experimental\nresults across synthetic and real-world examples reveal that the proposed\nacquisition strategy that caters to future tasks can significantly improve the\noptimization efficiency as soon as a sufficient number of tasks is processed.\n","authors":["Yunchuan Zhang","Sangwoo Park","Osvaldo Simeone"],"pdf_url":"https://arxiv.org/pdf/2403.09570v4.pdf","comment":"17 pages, 10 figures, published in IEEE Transactions on Signal\n Processing"},{"id":"http://arxiv.org/abs/2501.03301v2","updated":"2025-01-08T11:47:25Z","published":"2025-01-06T15:19:26Z","title":"Rethinking Byzantine Robustness in Federated Recommendation from Sparse\n Aggregation Perspective","summary":" To preserve user privacy in recommender systems, federated recommendation\n(FR) based on federated learning (FL) emerges, keeping the personal data on the\nlocal client and updating a model collaboratively. Unlike FL, FR has a unique\nsparse aggregation mechanism, where the embedding of each item is updated by\nonly partial clients, instead of full clients in a dense aggregation of general\nFL. Recently, as an essential principle of FL, model security has received\nincreasing attention, especially for Byzantine attacks, where malicious clients\ncan send arbitrary updates. The problem of exploring the Byzantine robustness\nof FR is particularly critical since in the domains applying FR, e.g.,\ne-commerce, malicious clients can be injected easily by registering new\naccounts. However, existing Byzantine works neglect the unique sparse\naggregation of FR, making them unsuitable for our problem. Thus, we make the\nfirst effort to investigate Byzantine attacks on FR from the perspective of\nsparse aggregation, which is non-trivial: it is not clear how to define\nByzantine robustness under sparse aggregations and design Byzantine attacks\nunder limited knowledge/capability. 
In this paper, we reformulate the Byzantine\nrobustness under sparse aggregation by defining the aggregation for a single\nitem as the smallest execution unit. Then we propose a family of effective\nattack strategies, named Spattack, which exploit the vulnerability in sparse\naggregation and are categorized along the adversary's knowledge and capability.\nExtensive experimental results demonstrate that Spattack can effectively\nprevent convergence and even break down defenses under a few malicious clients,\nraising alarms for securing FR systems.\n","authors":["Zhongjian Zhang","Mengmei Zhang","Xiao Wang","Lingjuan Lyu","Bo Yan","Junping Du","Chuan Shi"],"pdf_url":"https://arxiv.org/pdf/2501.03301v2.pdf","comment":"accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2501.04441v1","updated":"2025-01-08T11:45:50Z","published":"2025-01-08T11:45:50Z","title":"Motif Discovery Framework for Psychiatric EEG Data Classification","summary":" In current medical practice, patients undergoing depression treatment must\nwait four to six weeks before a clinician can assess medication response due to\nthe delayed noticeable effects of antidepressants. Identification of a\ntreatment response at any earlier stage is of great importance, since it can\nreduce the emotional and economic burden connected with the treatment. We\napproach the prediction of a patient response to a treatment as a\nclassification problem, by utilizing the dynamic properties of EEG recordings\non the 7th day of the treatment. We present a novel framework that applies\nmotif discovery to extract meaningful features from EEG data distinguishing\nbetween depression treatment responders and non-responders. We applied our\nframework also to classification tasks in other psychiatric EEG datasets,\nnamely to patients with symptoms of schizophrenia, pediatric patients with\nintractable seizures, and Alzheimer disease and dementia. We achieved high\nclassification precision in all data sets. 
The results demonstrate that the\ndynamic properties of the EEGs may support clinicians in decision making both\nin diagnosis and in the prediction of depression treatment response as early as on\nthe 7th day of the treatment. To the best of our knowledge, our work is the first one\nusing motifs in the depression diagnostics in general.\n","authors":["Melanija Kraljevska","Katerina Hlavackova-Schindler","Lukas Miklautz","Claudia Plant"],"pdf_url":"https://arxiv.org/pdf/2501.04441v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.01087v3","updated":"2025-01-08T11:40:29Z","published":"2025-01-02T06:19:53Z","title":"Bridging Simplicity and Sophistication using GLinear: A Novel\n Architecture for Enhanced Time Series Prediction","summary":" Time Series Forecasting (TSF) is an important application across many fields.\nThere is a debate about whether Transformers, despite being good at\nunderstanding long sequences, struggle with preserving temporal relationships\nin time series data. Recent research suggests that simpler linear models might\noutperform or at least provide competitive performance compared to complex\nTransformer-based models for TSF tasks. In this paper, we propose a novel\ndata-efficient architecture, GLinear, for multivariate TSF that exploits\nperiodic patterns to provide better accuracy. It also provides better\nprediction accuracy by using a smaller amount of historical data compared to\nother state-of-the-art linear predictors. Four different datasets (ETTh1,\nElectricity, Traffic, and Weather) are used to evaluate the performance of the\nproposed predictor. A performance comparison with state-of-the-art linear\narchitectures (such as NLinear, DLinear, and RLinear) and transformer-based\ntime series predictor (Autoformer) shows that the GLinear, despite being\nparametrically efficient, significantly outperforms the existing architectures\nin most cases of multivariate TSF. 
We hope that the proposed GLinear opens new\nfronts of research and development of simpler and more sophisticated\narchitectures for data and computationally efficient time-series analysis.\n","authors":["Syed Tahir Hussain Rizvi","Neel Kanwal","Muddasar Naeem","Alfredo Cuzzocrea","Antonio Coronato"],"pdf_url":"https://arxiv.org/pdf/2501.01087v3.pdf","comment":"Submitted to IEEE Transactions on Emerging Topics in Computational\n Intelligence"},{"id":"http://arxiv.org/abs/2501.04436v1","updated":"2025-01-08T11:37:06Z","published":"2025-01-08T11:37:06Z","title":"Federated Fine-Tuning of LLMs: Framework Comparison and Research\n Directions","summary":" Federated learning (FL) provides a privacy-preserving solution for\nfine-tuning pre-trained large language models (LLMs) using distributed private\ndatasets, enabling task-specific adaptation while preserving data privacy.\nHowever, fine-tuning the extensive parameters in LLMs is particularly\nchallenging in resource-constrained federated scenarios due to the significant\ncommunication and computational costs. To gain a deeper understanding of how\nthese challenges can be addressed, this article conducts a comparative analysis\nof three advanced federated LLM (FedLLM) frameworks that integrate knowledge\ndistillation (KD) and split learning (SL) to mitigate these issues: 1) FedLLMs,\nwhere clients upload model parameters or gradients to enable straightforward\nand effective fine-tuning; 2) KD-FedLLMs, which leverage KD for efficient\nknowledge sharing via logits; and 3) Split-FedLLMs, which split the LLMs into\ntwo parts, with one part executed on the client and the other one on the\nserver, to balance the computational load. Each framework is evaluated based on\nkey performance metrics, including model accuracy, communication overhead, and\nclient-side computational load, offering insights into their effectiveness for\nvarious federated fine-tuning scenarios. 
Through this analysis, we identify\nframework-specific optimization opportunities to enhance the efficiency of\nFedLLMs and discuss broader research directions, highlighting open\nopportunities to better adapt FedLLMs for real-world applications. A use case\nis presented to demonstrate the performance comparison of these three\nframeworks under varying configurations and settings.\n","authors":["Na Yan","Yang Su","Yansha Deng","Robert Schober"],"pdf_url":"https://arxiv.org/pdf/2501.04436v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04426v1","updated":"2025-01-08T11:20:48Z","published":"2025-01-08T11:20:48Z","title":"Dual-Force: Enhanced Offline Diversity Maximization under Imitation\n Constraints","summary":" While many algorithms for diversity maximization under imitation constraints\nare online in nature, many applications require offline algorithms without\nenvironment interactions. Tackling this problem in the offline setting,\nhowever, presents significant challenges that require non-trivial, multi-stage\noptimization processes with non-stationary rewards. In this work, we present a\nnovel offline algorithm that enhances diversity using an objective based on Van\nder Waals (VdW) force and successor features, and eliminates the need to learn\na previously used skill discriminator. Moreover, by conditioning the value\nfunction and policy on a pre-trained Functional Reward Encoding (FRE), our\nmethod allows for better handling of non-stationary rewards and provides\nzero-shot recall of all skills encountered during training, significantly\nexpanding the set of skills learned in prior work. Consequently, our algorithm\nbenefits from receiving a consistently strong diversity signal (VdW), and\nenjoys more stable and efficient training. 
We demonstrate the effectiveness of\nour method in generating diverse skills for two robotic tasks in simulation:\nlocomotion of a quadruped and local navigation with obstacle traversal.\n","authors":["Pavel Kolev","Marin Vlastelica","Georg Martius"],"pdf_url":"https://arxiv.org/pdf/2501.04426v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.02781v3","updated":"2025-01-08T11:16:18Z","published":"2025-01-06T05:53:38Z","title":"From Dense to Sparse: Event Response for Enhanced Residential Load\n Forecasting","summary":" Residential load forecasting (RLF) is crucial for resource scheduling in\npower systems. Most existing methods utilize all given load records (dense\ndata) to indiscriminately extract the dependencies between historical and\nfuture time series. However, there exist important regular patterns residing in\nthe event-related associations among different appliances (sparse knowledge),\nwhich have yet been ignored. In this paper, we propose an Event-Response\nKnowledge Guided approach (ERKG) for RLF by incorporating the estimation of\nelectricity usage events for different appliances, mining event-related sparse\nknowledge from the load series. With ERKG, the event-response estimation\nenables portraying the electricity consumption behaviors of residents,\nrevealing regular variations in appliance operational states. To be specific,\nERKG consists of knowledge extraction and guidance: i) a forecasting model is\ndesigned for the electricity usage events by estimating appliance operational\nstates, aiming to extract the event-related sparse knowledge; ii) a novel\nknowledge-guided mechanism is established by fusing such state estimates of the\nappliance events into the RLF model, which can give particular focuses on the\npatterns of users' electricity consumption behaviors. Notably, ERKG can\nflexibly serve as a plug-in module to boost the capability of existing\nforecasting models by leveraging event response. 
In numerical experiments,\nextensive comparisons and ablation studies have verified the effectiveness of\nour ERKG, e.g., over 8% MAE can be reduced on the tested state-of-the-art\nforecasting models.\n","authors":["Xin Cao","Qinghua Tao","Yingjie Zhou","Lu Zhang","Le Zhang","Dongjin Song","Dapeng Oliver Wu","Ce Zhu"],"pdf_url":"https://arxiv.org/pdf/2501.02781v3.pdf","comment":"12 pages and 6 figures. Accepted for publication by IEEE Transactions\n on Instrumentation and Measurement"},{"id":"http://arxiv.org/abs/2501.04421v1","updated":"2025-01-08T11:11:25Z","published":"2025-01-08T11:11:25Z","title":"Risk-averse policies for natural gas futures trading using\n distributional reinforcement learning","summary":" Financial markets have experienced significant instabilities in recent years,\ncreating unique challenges for trading and increasing interest in risk-averse\nstrategies. Distributional Reinforcement Learning (RL) algorithms, which model\nthe full distribution of returns rather than just expected values, offer a\npromising approach to managing market uncertainty. This paper investigates this\npotential by studying the effectiveness of three distributional RL algorithms\nfor natural gas futures trading and exploring their capacity to develop\nrisk-averse policies. Specifically, we analyze the performance and behavior of\nCategorical Deep Q-Network (C51), Quantile Regression Deep Q-Network (QR-DQN),\nand Implicit Quantile Network (IQN). To the best of our knowledge, these\nalgorithms have never been applied in a trading context. These policies are\ncompared against five Machine Learning (ML) baselines, using a detailed dataset\nprovided by Predictive Layer SA, a company supplying ML-based strategies for\nenergy trading. The main contributions of this study are as follows. (1) We\ndemonstrate that distributional RL algorithms significantly outperform\nclassical RL methods, with C51 achieving performance improvement of more than\n32\\%. 
(2) We show that training C51 and IQN to maximize CVaR produces\nrisk-sensitive policies with adjustable risk aversion. Specifically, our\nablation studies reveal that lower CVaR confidence levels increase risk\naversion, while higher levels decrease it, offering flexible risk management\noptions. In contrast, QR-DQN shows less predictable behavior. These findings\nemphasize the potential of distributional RL for developing adaptable,\nrisk-averse trading strategies in volatile markets.\n","authors":["Félicien Hêche","Biagio Nigro","Oussama Barakat","Stephan Robert-Nicoud"],"pdf_url":"https://arxiv.org/pdf/2501.04421v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.08704v3","updated":"2025-01-08T11:07:42Z","published":"2024-05-14T15:42:55Z","title":"Full Line Code Completion: Bringing AI to Desktop","summary":" In recent years, several industrial solutions for the problem of multi-token\ncode completion appeared, each making a great advance in the area but mostly\nfocusing on cloud-based runtime and avoiding working on the end user's device.\n In this work, we describe our approach for building a multi-token code\ncompletion feature for the JetBrains' IntelliJ Platform, which we call Full\nLine Code Completion. The feature suggests only syntactically correct code and\nworks fully locally, i.e., data querying and the generation of suggestions\nhappens on the end user's machine. We share important time and\nmemory-consumption restrictions, as well as design principles that a code\ncompletion engine should satisfy. Working entirely on the end user's device,\nour code completion engine enriches user experience while being not only fast\nand compact but also secure. 
We share a number of useful techniques to meet the\nstated development constraints and also describe offline and online evaluation\npipelines that allowed us to make better decisions.\n Our online evaluation shows that the usage of the tool leads to 1.3 times\nmore Python code in the IDE being produced by code completion. The described\nsolution was initially started with a help of researchers and was then bundled\ninto all JetBrains IDEs where it is now used by millions of users. Thus, we\nbelieve that this work is useful for bridging academia and industry, providing\nresearchers with the knowledge of what happens when complex research-based\nsolutions are integrated into real products.\n","authors":["Anton Semenkin","Vitaliy Bibaev","Yaroslav Sokolov","Kirill Krylov","Alexey Kalina","Anna Khannanova","Danila Savenkov","Darya Rovdo","Igor Davidenko","Kirill Karnaukhov","Maxim Vakhrushev","Mikhail Kostyukov","Mikhail Podvitskii","Petr Surkov","Yaroslav Golubev","Nikita Povarov","Timofey Bryksin"],"pdf_url":"https://arxiv.org/pdf/2405.08704v3.pdf","comment":"Published at ICSE'25. 12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2501.04413v1","updated":"2025-01-08T10:59:36Z","published":"2025-01-08T10:59:36Z","title":"Machine Learning and statistical classification of CRISPR-Cas12a\n diagnostic assays","summary":" CRISPR-based diagnostics have gained increasing attention as biosensing tools\nable to address limitations in contemporary molecular diagnostic tests. To\nmaximise the performance of CRISPR-based assays, much effort has focused on\noptimizing the chemistry and biology of the biosensing reaction. However, less\nattention has been paid to improving the techniques used to analyse\nCRISPR-based diagnostic data. To date, diagnostic decisions typically involve\nvarious forms of slope-based classification. Such methods are superior to\ntraditional methods based on assessing absolute signals, but still have\nlimitations. 
Herein, we establish performance benchmarks (total accuracy,\nsensitivity, and specificity) using common slope-based methods. We compare the\nperformance of these benchmark methods with three different quadratic empirical\ndistribution function statistical tests, finding significant improvements in\ndiagnostic speed and accuracy when applied to a clinical data set. Two of the\nthree statistical techniques, the Kolmogorov-Smirnov and Anderson-Darling\ntests, report the lowest time-to-result and highest total test accuracy.\nFurthermore, we developed a long short-term memory recurrent neural network to\nclassify CRISPR-biosensing data, achieving 100% specificity on our model data\nset. Finally, we provide guidelines on choosing the classification method and\nclassification method parameters that best suit a diagnostic assay's needs.\n","authors":["Nathan Khosla","Jake M. Lesinski","Marcus Haywood-Alexander","Andrew J. deMello","Daniel A. Richards"],"pdf_url":"https://arxiv.org/pdf/2501.04413v1.pdf","comment":"25 pages, 5 figures, research paper. Nathan Khosla and Jake M.\n Lesinski contributed equally. Electronic supporting information is included\n as an appendix"},{"id":"http://arxiv.org/abs/2501.04410v1","updated":"2025-01-08T10:49:13Z","published":"2025-01-08T10:49:13Z","title":"User Simulation in the Era of Generative AI: User Modeling, Synthetic\n Data Generation, and System Evaluation","summary":" User simulation is an emerging interdisciplinary topic with multiple critical\napplications in the era of Generative AI. It involves creating an intelligent\nagent that mimics the actions of a human user interacting with an AI system,\nenabling researchers to model and analyze user behaviour, generate synthetic\ndata for training, and evaluate interactive AI systems in a controlled and\nreproducible manner. User simulation has profound implications for diverse\nfields and plays a vital role in the pursuit of Artificial General\nIntelligence. 
This paper provides an overview of user simulation, highlighting\nits key applications, connections to various disciplines, and outlining future\nresearch directions to advance this increasingly important technology.\n","authors":["Krisztian Balog","ChengXiang Zhai"],"pdf_url":"https://arxiv.org/pdf/2501.04410v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04409v1","updated":"2025-01-08T10:49:06Z","published":"2025-01-08T10:49:06Z","title":"Lossless Privacy-Preserving Aggregation for Decentralized Federated\n Learning","summary":" Privacy concerns arise as sensitive data proliferate. Despite decentralized\nfederated learning (DFL) aggregating gradients from neighbors to avoid direct\ndata transmission, it still poses indirect data leaks from the transmitted\ngradients. Existing privacy-preserving methods for DFL add noise to gradients.\nThey either diminish the model predictive accuracy or suffer from ineffective\ngradient protection. In this paper, we propose a novel lossless\nprivacy-preserving aggregation rule named LPPA to enhance gradient protection\nas much as possible but without loss of DFL model predictive accuracy. LPPA\nsubtly injects the noise difference between the sent and received noise into\ntransmitted gradients for gradient protection. The noise difference\nincorporates neighbors' randomness for each client, effectively safeguarding\nagainst data leaks. LPPA employs the noise flow conservation theory to ensure\nthat the noise impact can be globally eliminated. The global sum of all noise\ndifferences remains zero, ensuring that accurate gradient aggregation is\nunaffected and the model accuracy remains intact. We theoretically prove that\nthe privacy-preserving capacity of LPPA is \\sqrt{2} times greater than that of\nnoise addition, while maintaining comparable model accuracy to the standard DFL\naggregation without noise injection. 
Experimental results verify the\ntheoretical findings and show that LPPA achieves a 13% mean improvement in\naccuracy over noise addition. We also demonstrate the effectiveness of LPPA in\nprotecting raw data and guaranteeing lossless model accuracy.\n","authors":["Xiaoye Miao","Bin Li","Yangyang Wu","Meng Xi","Xinkui Zhao","Jianwei Yin"],"pdf_url":"https://arxiv.org/pdf/2501.04409v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.00817v2","updated":"2025-01-08T10:42:53Z","published":"2025-01-01T12:04:06Z","title":"Hardness of Learning Fixed Parities with Neural Networks","summary":" Learning parity functions is a canonical problem in learning theory, which\nalthough computationally tractable, is not amenable to standard learning\nalgorithms such as gradient-based methods. This hardness is usually explained\nvia statistical query lower bounds [Kearns, 1998]. However, these bounds only\nimply that for any given algorithm, there is some worst-case parity function\nthat will be hard to learn. Thus, they do not explain why fixed parities - say,\nthe full parity function over all coordinates - are difficult to learn in\npractice, at least with standard predictors and gradient-based methods [Abbe\nand Boix-Adsera, 2022]. In this paper, we address this open problem, by showing\nthat for any fixed parity of some minimal size, using it as a target function\nto train one-hidden-layer ReLU networks with perturbed gradient descent will\nfail to produce anything meaningful. 
To establish this, we prove a new result\nabout the decay of the Fourier coefficients of linear threshold (or weighted\nmajority) functions, which may be of independent interest.\n","authors":["Itamar Shoshani","Ohad Shamir"],"pdf_url":"https://arxiv.org/pdf/2501.00817v2.pdf","comment":"An updated version was uploaded in order to fix a typo at theorem 2\n statement"},{"id":"http://arxiv.org/abs/2501.04403v1","updated":"2025-01-08T10:33:21Z","published":"2025-01-08T10:33:21Z","title":"Rising Rested MAB with Linear Drift","summary":" We consider non-stationary multi-arm bandit (MAB) where the expected reward\nof each action follows a linear function of the number of times we executed the\naction. Our main result is a tight regret bound of\n$\\tilde{\\Theta}(T^{4/5}K^{3/5})$, by providing both upper and lower bounds. We\nextend our results to derive instance dependent regret bounds, which depend on\nthe unknown parametrization of the linear drift of the rewards.\n","authors":["Omer Amichay","Yishay Mansour"],"pdf_url":"https://arxiv.org/pdf/2501.04403v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04401v1","updated":"2025-01-08T10:29:35Z","published":"2025-01-08T10:29:35Z","title":"Tracking UWB Devices Through Radio Frequency Fingerprinting Is Possible","summary":" Ultra-wideband (UWB) is a state-of-the-art technology designed for\napplications requiring centimeter-level localization. Its widespread adoption\nby smartphone manufacturer naturally raises security and privacy concerns.\nSuccessfully implementing Radio Frequency Fingerprinting (RFF) to UWB could\nenable physical layer security, but might also allow undesired tracking of the\ndevices. The scope of this paper is to explore the feasibility of applying RFF\nto UWB and investigates how well this technique generalizes across different\nenvironments. We collected a realistic dataset using off-the-shelf UWB devices\nwith controlled variation in device positioning. 
Moreover, we developed an\nimproved deep learning pipeline to extract the hardware signature from the\nsignal data. In stable conditions, the extracted RFF achieves over 99%\naccuracy. While the accuracy decreases in more changing environments, we still\nobtain up to 76% accuracy in untrained locations.\n","authors":["Thibaud Ardoin","Niklas Pauli","Benedikt Groß","Mahsa Kholghi","Khan Reaz","Gerhard Wunder"],"pdf_url":"https://arxiv.org/pdf/2501.04401v1.pdf","comment":"conference ICNC'25, 7 pages, 7 figures"},{"id":"http://arxiv.org/abs/2207.03890v3","updated":"2025-01-08T10:05:34Z","published":"2022-07-08T13:25:06Z","title":"ENCODE: Encoding NetFlows for Network Anomaly Detection","summary":" NetFlow data is a popular network log format used by many network analysts\nand researchers. The advantages of using NetFlow over deep packet inspection\nare that it is easier to collect and process, and it is less privacy intrusive.\nMany works have used machine learning to detect network attacks using NetFlow\ndata. The first step for these machine learning pipelines is to pre-process the\ndata before it is given to the machine learning algorithm. Many approaches\nexist to pre-process NetFlow data; however, these simply apply existing methods\nto the data, not considering the specific properties of network data. We argue\nthat for data originating from software systems, such as NetFlow or software\nlogs, similarities in frequency and contexts of feature values are more\nimportant than similarities in the value itself. In this work, we propose an\nencoding algorithm that directly takes the frequency and the context of the\nfeature values into account when the data is being processed. Different types\nof network behaviours can be clustered using this encoding, thus aiding the\nprocess of detecting anomalies within the network. We train several machine\nlearning models for anomaly detection using the data that has been encoded with\nour encoding algorithm. 
We evaluate the effectiveness of our encoding on a new\ndataset that we created for network attacks on Kubernetes clusters and two\nwell-known public NetFlow datasets. We empirically demonstrate that the machine\nlearning models benefit from using our encoding for anomaly detection.\n","authors":["Clinton Cao","Annibale Panichella","Sicco Verwer","Agathe Blaise","Filippo Rebecchi"],"pdf_url":"https://arxiv.org/pdf/2207.03890v3.pdf","comment":"11 pages, 17 figures"},{"id":"http://arxiv.org/abs/2501.04387v1","updated":"2025-01-08T09:57:08Z","published":"2025-01-08T09:57:08Z","title":"The unbearable lightness of Restricted Boltzmann Machines: Theoretical\n Insights and Biological Applications","summary":" Restricted Boltzmann Machines are simple yet powerful neural networks. They\ncan be used for learning structure in data, and are used as a building block of\nmore complex neural architectures. At the same time, their simplicity makes\nthem easy to use, amenable to theoretical analysis, yielding interpretable\nmodels in applications. Here, we focus on reviewing the role that the\nactivation functions, describing the input-output relationship of single\nneurons in RBM, play in the functionality of these models. We discuss recent\ntheoretical results on the benefits and limitations of different activation\nfunctions. We also review applications to biological data analysis, namely\nneural data analysis, where RBM units are mostly taken to have sigmoid\nactivation functions and binary units, to protein data analysis and immunology\nwhere non-binary units and non-sigmoid activation functions have recently been\nshown to yield important insights into the data. Finally, we discuss open\nproblems addressing which can shed light on broader issues in neural network\nresearch.\n","authors":["Giovanni di Sarra","Barbara Bravi","Yasser Roudi"],"pdf_url":"https://arxiv.org/pdf/2501.04387v1.pdf","comment":"7 pages, 3 figures. To be published in EPL as di Sarra et al 2025\n EPL. 
Accepted manuscript available online at\n https://doi.org/10.1209/0295-5075/ada636"},{"id":"http://arxiv.org/abs/2409.20431v2","updated":"2025-01-08T09:54:15Z","published":"2024-09-30T15:53:24Z","title":"Multilevel Picard approximations and deep neural networks with ReLU,\n leaky ReLU, and softplus activation overcome the curse of dimensionality when\n approximating semilinear parabolic partial differential equations in\n $L^p$-sense","summary":" We prove that multilevel Picard approximations and deep neural networks with\nReLU, leaky ReLU, and softplus activation are capable of approximating\nsolutions of semilinear Kolmogorov PDEs in $L^\\mathfrak{p}$-sense,\n$\\mathfrak{p}\\in [2,\\infty)$, in the case of gradient-independent,\nLipschitz-continuous nonlinearities, while the computational effort of the\nmultilevel Picard approximations and the required number of parameters in the\nneural networks grow at most polynomially in both dimension $d\\in \\mathbb{N}$\nand reciprocal of the prescribed accuracy $\\epsilon$.\n","authors":["Ariel Neufeld","Tuan Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2409.20431v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.19109v2","updated":"2025-01-08T09:48:02Z","published":"2024-12-26T07:58:09Z","title":"Stochastic normalizing flows for Effective String Theory","summary":" Effective String Theory (EST) is a powerful tool used to study confinement in\npure gauge theories by modeling the confining flux tube connecting a static\nquark-anti-quark pair as a thin vibrating string. Recently, flow-based samplers\nhave been applied as an efficient numerical method to study EST regularized on\nthe lattice, opening the route to study observables previously inaccessible to\nstandard analytical methods. Flow-based samplers are a class of algorithms\nbased on Normalizing Flows (NFs), deep generative models recently proposed as a\npromising alternative to traditional Markov Chain Monte Carlo methods in\nlattice field theory calculations. 
By combining NF layers with\nout-of-equilibrium stochastic updates, we obtain Stochastic Normalizing Flows\n(SNFs), a scalable class of machine learning algorithms that can be explained\nin terms of stochastic thermodynamics. In this contribution, we outline EST and\nSNFs, and report some numerical results for the shape of the flux tube.\n","authors":["Michele Caselle","Elia Cellini","Alessandro Nada"],"pdf_url":"https://arxiv.org/pdf/2412.19109v2.pdf","comment":"1+ 10 pages, 2 figures, contribution for the 41st International\n Symposium on Lattice Field Theory (Lattice 2024), 28 July - 3 August 2024,\n Liverpool, UK; v2: 1+ 10 pages, 2 figures, reference added"},{"id":"http://arxiv.org/abs/2501.04377v1","updated":"2025-01-08T09:34:15Z","published":"2025-01-08T09:34:15Z","title":"On Computational Limits and Provably Efficient Criteria of Visual\n Autoregressive Models: A Fine-Grained Complexity Analysis","summary":" Recently, Visual Autoregressive ($\\mathsf{VAR}$) Models introduced a\ngroundbreaking advancement in the field of image generation, offering a\nscalable approach through a coarse-to-fine \"next-scale prediction\" paradigm.\nHowever, the state-of-the-art algorithm of $\\mathsf{VAR}$ models in [Tian,\nJiang, Yuan, Peng and Wang, NeurIPS 2024] takes $O(n^4)$ time, which is\ncomputationally inefficient. In this work, we analyze the computational limits\nand efficiency criteria of $\\mathsf{VAR}$ Models through a fine-grained\ncomplexity lens. Our key contribution is identifying the conditions under which\n$\\mathsf{VAR}$ computations can achieve sub-quadratic time complexity.\nSpecifically, we establish a critical threshold for the norm of input matrices\nused in $\\mathsf{VAR}$ attention mechanisms. Above this threshold, assuming the\nStrong Exponential Time Hypothesis ($\\mathsf{SETH}$) from fine-grained\ncomplexity theory, a sub-quartic time algorithm for $\\mathsf{VAR}$ models is\nimpossible. 
To substantiate our theoretical findings, we present efficient\nconstructions leveraging low-rank approximations that align with the derived\ncriteria. This work initiates the study of the computational efficiency of the\n$\\mathsf{VAR}$ model from a theoretical perspective. Our technique will shed\nlight on advancing scalable and efficient image generation in $\\mathsf{VAR}$\nframeworks.\n","authors":["Yekun Ke","Xiaoyu Li","Yingyu Liang","Zhizhou Sha","Zhenmei Shi","Zhao Song"],"pdf_url":"https://arxiv.org/pdf/2501.04377v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18601v2","updated":"2025-01-08T09:30:47Z","published":"2024-07-26T08:41:58Z","title":"Reorganizing attention-space geometry with expressive attention","summary":" Attention regulates information transfer between tokens. For this, query and\nkey vectors are compared, typically in terms of a scalar product,\n$\\mathbf{Q}^T\\mathbf{K}$, together with a subsequent softmax normalization. In\ngeometric terms, the standard dot-product attention (DPA) leads to large/small\nattention weights for parallel/antiparallel queries and keys. Here we study\nexpressive attention (EA), which is based on $(\\mathbf{Q}^T\\mathbf{K})^2$, the\nsquared dot product. In this case, attention is enhanced when query and key are\neither parallel or antiparallel, and suppressed for orthogonal configurations.\nEA can be introduced into any attention-based code without additional compute\ncosts or memory requirements. For a series of autoregressive prediction tasks,\nwe find that expressive attention performs at least as well as vanilla DPA.\nIncreasing task complexity, EA is observed to outperform DPA with increasing\nmargins, which also holds for multi-task settings. For a given model size, EA\nmanages to achieve 100% performance for a range of complexity levels not\naccessible to DPA. 
Our results show that it is possible to reorganize the\ngeometry of the matching condition in the space of attention heads without loss\nof performance.\n","authors":["Claudius Gros"],"pdf_url":"https://arxiv.org/pdf/2407.18601v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15267v2","updated":"2025-01-08T09:18:05Z","published":"2024-12-17T05:04:57Z","title":"Toxicity Detection towards Adaptability to Changing Perturbations","summary":" Toxicity detection is crucial for maintaining the peace of the society. While\nexisting methods perform well on normal toxic contents or those generated by\nspecific perturbation methods, they are vulnerable to evolving perturbation\npatterns. However, in real-world scenarios, malicious users tend to create new\nperturbation patterns for fooling the detectors. For example, some users may\ncircumvent the detector of large language models (LLMs) by adding `I am a\nscientist' at the beginning of the prompt. In this paper, we introduce a novel\nproblem, i.e., continual learning jailbreak perturbation patterns, into the\ntoxicity detection field. To tackle this problem, we first construct a new\ndataset generated by 9 types of perturbation patterns, 7 of them are summarized\nfrom prior work and 2 of them are developed by us. We then systematically\nvalidate the vulnerability of current methods on this new perturbation\npattern-aware dataset via both the zero-shot and fine tuned cross-pattern\ndetection. Upon this, we present the domain incremental learning paradigm and\nthe corresponding benchmark to ensure the detector's robustness to dynamically\nemerging types of perturbed toxic text. 
Our code and dataset are provided in\nthe appendix and will be publicly available at GitHub, by which we wish to\noffer new research opportunities for the security-relevant communities.\n","authors":["Hankun Kang","Jianhao Chen","Yongqi Li","Xin Miao","Mayi Xu","Ming Zhong","Yuanyuan Zhu","Tieyun Qian"],"pdf_url":"https://arxiv.org/pdf/2412.15267v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04080v3","updated":"2025-01-08T09:03:14Z","published":"2024-02-06T15:34:30Z","title":"Entropy-regularized Diffusion Policy with Q-Ensembles for Offline\n Reinforcement Learning","summary":" This paper presents advanced techniques of training diffusion policies for\noffline reinforcement learning (RL). At the core is a mean-reverting stochastic\ndifferential equation (SDE) that transfers a complex action distribution into a\nstandard Gaussian and then samples actions conditioned on the environment state\nwith a corresponding reverse-time SDE, like a typical diffusion policy. We show\nthat such an SDE has a solution that we can use to calculate the log\nprobability of the policy, yielding an entropy regularizer that improves the\nexploration of offline datasets. To mitigate the impact of inaccurate value\nfunctions from out-of-distribution data points, we further propose to learn the\nlower confidence bound of Q-ensembles for more robust policy improvement. By\ncombining the entropy-regularized diffusion policy with Q-ensembles in offline\nRL, our method achieves state-of-the-art performance on most tasks in D4RL\nbenchmarks. Code is available at\nhttps://github.com/ruoqizzz/Entropy-Regularized-Diffusion-Policy-with-QEnsemble.\n","authors":["Ruoqi Zhang","Ziwei Luo","Jens Sjölund","Thomas B. 
Schön","Per Mattsson"],"pdf_url":"https://arxiv.org/pdf/2402.04080v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03562v2","updated":"2025-01-08T08:57:32Z","published":"2025-01-07T06:22:55Z","title":"Rethinking Adversarial Attacks in Reinforcement Learning from Policy\n Distribution Perspective","summary":" Deep Reinforcement Learning (DRL) suffers from uncertainties and inaccuracies\nin the observation signal in realworld applications. Adversarial attack is an\neffective method for evaluating the robustness of DRL agents. However, existing\nattack methods targeting individual sampled actions have limited impacts on the\noverall policy distribution, particularly in continuous action spaces. To\naddress these limitations, we propose the Distribution-Aware Projected Gradient\nDescent attack (DAPGD). DAPGD uses distribution similarity as the gradient\nperturbation input to attack the policy network, which leverages the entire\npolicy distribution rather than relying on individual samples. We utilize the\nBhattacharyya distance in DAPGD to measure policy similarity, enabling\nsensitive detection of subtle but critical differences between probability\ndistributions. 
Our experiment results demonstrate that DAPGD achieves SOTA\nresults compared to the baselines in three robot navigation tasks, achieving an\naverage 22.03% higher reward drop compared to the best baseline.\n","authors":["Tianyang Duan","Zongyuan Zhang","Zheng Lin","Yue Gao","Ling Xiong","Yong Cui","Hongbin Liang","Xianhao Chen","Heming Cui","Dong Huang"],"pdf_url":"https://arxiv.org/pdf/2501.03562v2.pdf","comment":"10 pages, 2 figures, 2 tables"},{"id":"http://arxiv.org/abs/2501.04359v1","updated":"2025-01-08T08:55:10Z","published":"2025-01-08T08:55:10Z","title":"Decoding EEG Speech Perception with Transformers and VAE-based Data\n Augmentation","summary":" Decoding speech from non-invasive brain signals, such as\nelectroencephalography (EEG), has the potential to advance brain-computer\ninterfaces (BCIs), with applications in silent communication and assistive\ntechnologies for individuals with speech impairments. However, EEG-based speech\ndecoding faces major challenges, such as noisy data, limited datasets, and poor\nperformance on complex tasks like speech perception. This study attempts to\naddress these challenges by employing variational autoencoders (VAEs) for EEG\ndata augmentation to improve data quality and applying a state-of-the-art\n(SOTA) sequence-to-sequence deep learning architecture, originally successful\nin electromyography (EMG) tasks, to EEG-based speech decoding. Additionally, we\nadapt this architecture for word classification tasks. Using the Brennan\ndataset, which contains EEG recordings of subjects listening to narrated\nspeech, we preprocess the data and evaluate both classification and\nsequence-to-sequence models for EEG-to-words/sentences tasks. Our experiments\nshow that VAEs have the potential to reconstruct artificial EEG data for\naugmentation. Meanwhile, our sequence-to-sequence model achieves more promising\nperformance in generating sentences compared to our classification model,\nthough both remain challenging tasks. 
These findings lay the groundwork for\nfuture research on EEG speech perception decoding, with possible extensions to\nspeech production tasks such as silent or imagined speech.\n","authors":["Terrance Yu-Hao Chen","Yulin Chen","Pontus Soederhaell","Sadrishya Agrawal","Kateryna Shapovalenko"],"pdf_url":"https://arxiv.org/pdf/2501.04359v1.pdf","comment":"19 pages, 15 figures, 2 tables"},{"id":"http://arxiv.org/abs/2501.04353v1","updated":"2025-01-08T08:51:35Z","published":"2025-01-08T08:51:35Z","title":"DeFusion: An Effective Decoupling Fusion Network for Multi-Modal\n Pregnancy Prediction","summary":" Temporal embryo images and parental fertility table indicators are both\nvaluable for pregnancy prediction in \\textbf{in vitro fertilization embryo\ntransfer} (IVF-ET). However, current machine learning models cannot make full\nuse of the complementary information between the two modalities to improve\npregnancy prediction performance. In this paper, we propose a Decoupling Fusion\nNetwork called DeFusion to effectively integrate the multi-modal information\nfor IVF-ET pregnancy prediction. Specifically, we propose a decoupling fusion\nmodule that decouples the information from the different modalities into\nrelated and unrelated information, thereby achieving a more delicate fusion.\nAnd we fuse temporal embryo images with a spatial-temporal position encoding,\nand extract fertility table indicator information with a table transformer. To\nevaluate the effectiveness of our model, we use a new dataset including 4046\ncases collected from Southern Medical University. The experiments show that our\nmodel outperforms state-of-the-art methods. Meanwhile, the performance on the\neye disease prediction dataset reflects the model's good generalization. 
Our\ncode and dataset are available at https://github.com/Ou-Young-1999/DFNet.\n","authors":["Xueqiang Ouyang","Jia Wei","Wenjie Huo","Xiaocong Wang","Rui Li","Jianlong Zhou"],"pdf_url":"https://arxiv.org/pdf/2501.04353v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04339v1","updated":"2025-01-08T08:21:58Z","published":"2025-01-08T08:21:58Z","title":"DCIts -- Deep Convolutional Interpreter for time series","summary":" We introduce an interpretable deep learning model for multivariate time\nseries forecasting that prioritizes both predictive performance and\ninterpretability - key requirements for understanding complex physical\nphenomena. Our model not only matches but often surpasses existing\ninterpretability methods, achieving this without compromising accuracy. Through\nextensive experiments, we demonstrate its ability to identify the most relevant\ntime series and lags that contribute to forecasting future values, providing\nintuitive and transparent explanations for its predictions. To minimize the\nneed for manual supervision, the model is designed so one can robustly\ndetermine the optimal window size that captures all necessary interactions\nwithin the smallest possible time frame. Additionally, it effectively\nidentifies the optimal model order, balancing complexity when incorporating\nhigher-order terms. These advancements hold significant implications for\nmodeling and understanding dynamic systems, making the model a valuable tool\nfor applied and computational physicists.\n","authors":["Davor Horvatic","Domjan Baric"],"pdf_url":"https://arxiv.org/pdf/2501.04339v1.pdf","comment":"37 pages, 15 figures"},{"id":"http://arxiv.org/abs/2405.18725v2","updated":"2025-01-08T08:20:07Z","published":"2024-05-29T03:16:12Z","title":"Can We Enhance the Quality of Mobile Crowdsensing Data Without Ground\n Truth?","summary":" Mobile crowdsensing (MCS) has emerged as a prominent trend across various\ndomains. 
However, ensuring the quality of the sensing data submitted by mobile\nusers (MUs) remains a complex and challenging problem. To address this\nchallenge, an advanced method is needed to detect low-quality sensing data and\nidentify malicious MUs that may disrupt the normal operations of an MCS system.\nTherefore, this article proposes a prediction- and reputation-based truth\ndiscovery (PRBTD) framework, which can separate low-quality data from\nhigh-quality data in sensing tasks. First, we apply a correlation-focused\nspatio-temporal Transformer network that learns from the historical sensing\ndata and predicts the ground truth of the data submitted by MUs. However, due\nto the noise in historical data for training and the bursty values within\nsensing data, the prediction results can be inaccurate. To address this issue,\nwe use the implications among the sensing data, which are learned from the\nprediction results but are stable and less affected by inaccurate predictions,\nto evaluate the quality of the data. Finally, we design a reputation-based\ntruth discovery (TD) module for identifying low-quality data with their\nimplications. Given the sensing data submitted by MUs, PRBTD can eliminate the\ndata with heavy noise and identify malicious MUs with high accuracy. 
Extensive\nexperimental results demonstrate that the PRBTD method outperforms existing\nmethods in terms of identification accuracy and data quality enhancement.\n","authors":["Jiajie Li","Bo Gu","Shimin Gong","Zhou Su","Mohsen Guizani"],"pdf_url":"https://arxiv.org/pdf/2405.18725v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04331v1","updated":"2025-01-08T08:05:18Z","published":"2025-01-08T08:05:18Z","title":"AutoDFL: A Scalable and Automated Reputation-Aware Decentralized\n Federated Learning","summary":" Blockchained federated learning (BFL) combines the concepts of federated\nlearning and blockchain technology to enhance privacy, security, and\ntransparency in collaborative machine learning models. However, implementing\nBFL frameworks poses challenges in terms of scalability and cost-effectiveness.\nReputation-aware BFL poses even more challenges, as blockchain validators are\ntasked with processing federated learning transactions along with the\ntransactions that evaluate FL tasks and aggregate reputations. This leads to\nfaster blockchain congestion and performance degradation. To improve BFL\nefficiency while increasing scalability and reducing on-chain reputation\nmanagement costs, this paper proposes AutoDFL, a scalable and automated\nreputation-aware decentralized federated learning framework. AutoDFL leverages\nzk-Rollups as a Layer-2 scaling solution to boost the performance while\nmaintaining the same level of security as the underlying Layer-1 blockchain.\nMoreover, AutoDFL introduces an automated and fair reputation model designed to\nincentivize federated learning actors. We develop a proof of concept for our\nframework for an accurate evaluation. 
Tested with various custom workloads,\nAutoDFL reaches an average throughput of over 3000 TPS with a gas reduction of\nup to 20X.\n","authors":["Meryem Malak Dif","Mouhamed Amine Bouchiha","Mourad Rabah","Yacine Ghamri-Doudane"],"pdf_url":"https://arxiv.org/pdf/2501.04331v1.pdf","comment":"Paper accepted at NOMS'2025 (pages 9, figures 5)"},{"id":"http://arxiv.org/abs/2406.01189v3","updated":"2025-01-08T07:59:53Z","published":"2024-06-03T10:51:43Z","title":"MultiMax: Sparse and Multi-Modal Attention Learning","summary":" SoftMax is a ubiquitous ingredient of modern machine learning algorithms. It\nmaps an input vector onto a probability simplex and reweights the input by\nconcentrating the probability mass at large entries. Yet, as a smooth\napproximation to the Argmax function, a significant amount of probability mass\nis distributed to other, residual entries, leading to poor interpretability and\nnoise. Although sparsity can be achieved by a family of SoftMax variants, they\noften require an alternative loss function and do not preserve multi-modality.\nWe show that this trade-off between multi-modality and sparsity limits the\nexpressivity of SoftMax as well as its variants. We provide a solution to this\ntension between objectives by proposing a piece-wise differentiable function,\ntermed MultiMax, which adaptively modulates the output distribution according\nto input entry range. Through comprehensive analysis and evaluation, we show\nthat MultiMax successfully produces a distribution that suppresses irrelevant\nentries while preserving multimodality, with benefits in image classification,\nlanguage modeling and machine translation. 
The code is available at\nhttps://github.com/ZhouYuxuanYX/MultiMax.\n","authors":["Yuxuan Zhou","Mario Fritz","Margret Keuper"],"pdf_url":"https://arxiv.org/pdf/2406.01189v3.pdf","comment":"Accepted at ICML 2024"},{"id":"http://arxiv.org/abs/2501.04319v1","updated":"2025-01-08T07:32:54Z","published":"2025-01-08T07:32:54Z","title":"VerifBFL: Leveraging zk-SNARKs for A Verifiable Blockchained Federated\n Learning","summary":" Blockchain-based Federated Learning (FL) is an emerging decentralized machine\nlearning paradigm that enables model training without relying on a central\nserver. Although some BFL frameworks are considered privacy-preserving, they\nare still vulnerable to various attacks, including inference and model\npoisoning. Additionally, most of these solutions employ strong trust\nassumptions among all participating entities or introduce incentive mechanisms\nto encourage collaboration, making them susceptible to multiple security flaws.\nThis work presents VerifBFL, a trustless, privacy-preserving, and verifiable\nfederated learning framework that integrates blockchain technology and\ncryptographic protocols. By employing zero-knowledge Succinct Non-Interactive\nArgument of Knowledge (zk-SNARKs) and incrementally verifiable computation\n(IVC), VerifBFL ensures the verifiability of both local training and\naggregation processes. The proofs of training and aggregation are verified\non-chain, guaranteeing the integrity and auditability of each participant's\ncontributions. To protect training data from inference attacks, VerifBFL\nleverages differential privacy. Finally, to demonstrate the efficiency of the\nproposed protocols, we built a proof of concept using emerging tools. 
The\nresults show that generating proofs for local training and aggregation in\nVerifBFL takes less than 81s and 2s, respectively, while verifying them\non-chain takes less than 0.6s.\n","authors":["Ahmed Ayoub Bellachia","Mouhamed Amine Bouchiha","Yacine Ghamri-Doudane","Mourad Rabah"],"pdf_url":"https://arxiv.org/pdf/2501.04319v1.pdf","comment":"Paper accepted at NOMS'25 (9 pages, 6 Figures)"},{"id":"http://arxiv.org/abs/2501.02721v3","updated":"2025-01-08T07:31:13Z","published":"2025-01-06T02:25:48Z","title":"Learning Stochastic Nonlinear Dynamics with Embedded Latent Transfer\n Operators","summary":" We consider an operator-based latent Markov representation of a stochastic\nnonlinear dynamical system, where the stochastic evolution of the latent state\nembedded in a reproducing kernel Hilbert space is described with the\ncorresponding transfer operator, and develop a spectral method to learn this\nrepresentation based on the theory of stochastic realization. The embedding may\nbe learned simultaneously using reproducing kernels, for example, constructed\nwith feed-forward neural networks. We also address the generalization of\nsequential state-estimation (Kalman filtering) in stochastic nonlinear systems,\nand of operator-based eigen-mode decomposition of dynamics, for the\nrepresentation. Several examples with synthetic and real-world data are shown\nto illustrate the empirical characteristics of our methods, and to investigate\nthe performance of our model in sequential state-estimation and mode\ndecomposition.\n","authors":["Naichang Ke","Ryogo Tanaka","Yoshinobu Kawahara"],"pdf_url":"https://arxiv.org/pdf/2501.02721v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.05412v4","updated":"2025-01-08T07:29:55Z","published":"2023-06-08T17:56:46Z","title":"Decoupled Prioritized Resampling for Offline RL","summary":" Offline reinforcement learning (RL) is challenged by the distributional shift\nproblem. 
To address this problem, existing works mainly focus on designing\nsophisticated policy constraints between the learned policy and the behavior\npolicy. However, these constraints are applied equally to well-performing and\ninferior actions through uniform sampling, which might negatively affect the\nlearned policy. To alleviate this issue, we propose Offline Prioritized\nExperience Replay (OPER), featuring a class of priority functions designed to\nprioritize highly-rewarding transitions, making them more frequently visited\nduring training. Through theoretical analysis, we show that this class of\npriority functions induces an improved behavior policy, and when constrained to\nthis improved policy, a policy-constrained offline RL algorithm is likely to\nyield a better solution. We develop two practical strategies to obtain priority\nweights by estimating advantages based on a fitted value network (OPER-A) or\nutilizing trajectory returns (OPER-R) for quick computation. OPER is a\nplug-and-play component for offline RL algorithms. As case studies, we evaluate\nOPER on five different algorithms, including BC, TD3+BC, Onestep RL, CQL, and\nIQL. Extensive experiments demonstrate that both OPER-A and OPER-R\nsignificantly improve the performance for all baseline methods. Codes and\npriority weights are available at https://github.com/sail-sg/OPER.\n","authors":["Yang Yue","Bingyi Kang","Xiao Ma","Qisen Yang","Gao Huang","Shiji Song","Shuicheng Yan"],"pdf_url":"https://arxiv.org/pdf/2306.05412v4.pdf","comment":"published on IEEE TNNLS"},{"id":"http://arxiv.org/abs/2411.07464v2","updated":"2025-01-08T07:25:55Z","published":"2024-11-12T00:57:30Z","title":"BudgetMLAgent: A Cost-Effective LLM Multi-Agent system for Automating\n Machine Learning Tasks","summary":" Large Language Models (LLMs) excel in diverse applications including\ngeneration of code snippets, but often struggle with generating code for\ncomplex Machine Learning (ML) tasks. 
Although existing LLM single-agent based\nsystems give varying performance depending on the task complexity, they purely\nrely on larger and expensive models such as GPT-4. Our investigation reveals\nthat no-cost and low-cost models such as Gemini-Pro, Mixtral and CodeLlama\nperform far worse than GPT-4 in a single-agent setting. With the motivation of\ndeveloping a cost-efficient LLM based solution for solving ML tasks, we propose\nan LLM Multi-Agent based system which leverages combination of experts using\nprofiling, efficient retrieval of past observations, LLM cascades, and\nask-the-expert calls. Through empirical analysis on ML engineering tasks in the\nMLAgentBench benchmark, we demonstrate the effectiveness of our system, using\nno-cost models, namely Gemini as the base LLM, paired with GPT-4 in cascade and\nexpert to serve occasional ask-the-expert calls for planning. With 94.2\\%\nreduction in the cost (from \\$0.931 per run cost averaged over all tasks for\nGPT-4 single agent system to \\$0.054), our system is able to yield better\naverage success rate of 32.95\\% as compared to GPT-4 single-agent system\nyielding 22.72\\% success rate averaged over all the tasks of MLAgentBench.\n","authors":["Shubham Gandhi","Manasi Patwardhan","Lovekesh Vig","Gautam Shroff"],"pdf_url":"https://arxiv.org/pdf/2411.07464v2.pdf","comment":"Presented at AIMLSystems '24"},{"id":"http://arxiv.org/abs/2407.16040v2","updated":"2025-01-08T07:21:15Z","published":"2024-07-22T20:34:00Z","title":"Generalizing Teacher Networks for Effective Knowledge Distillation\n Across Student Architectures","summary":" Knowledge distillation (KD) is a model compression method that entails\ntraining a compact student model to emulate the performance of a more complex\nteacher model. However, the architectural capacity gap between the two models\nlimits the effectiveness of knowledge transfer. 
Addressing this issue, previous\nworks focused on customizing teacher-student pairs to improve compatibility, a\ncomputationally expensive process that needs to be repeated every time either\nmodel changes. Hence, these methods are impractical when a teacher model has to\nbe compressed into different student models for deployment on multiple hardware\ndevices with distinct resource constraints. In this work, we propose Generic\nTeacher Network (GTN), a one-off KD-aware training to create a generic teacher\ncapable of effectively transferring knowledge to any student model sampled from\na given finite pool of architectures. To this end, we represent the student\npool as a weight-sharing supernet and condition our generic teacher to align\nwith the capacities of various student architectures sampled from this\nsupernet. Experimental evaluation shows that our method both improves overall\nKD effectiveness and amortizes the minimal additional training cost of the\ngeneric teacher across students in the pool.\n","authors":["Kuluhan Binici","Weiming Wu","Tulika Mitra"],"pdf_url":"https://arxiv.org/pdf/2407.16040v2.pdf","comment":"British Machine Vision Conference (BMVC 24)"},{"id":"http://arxiv.org/abs/2408.12545v2","updated":"2025-01-08T07:20:32Z","published":"2024-08-22T16:59:32Z","title":"Dynamics of Meta-learning Representation in the Teacher-student Scenario","summary":" Gradient-based meta-learning algorithms have gained popularity for their\nability to train models on new tasks using limited data. Empirical observations\nindicate that such algorithms are able to learn a shared representation across\ntasks, which is regarded as a key factor in their success. However, the\nin-depth theoretical understanding of the learning dynamics and the origin of\nthe shared representation remains underdeveloped. In this work, we investigate\nthe meta-learning dynamics of nonlinear two-layer neural networks trained on\nstreaming tasks in the teacher-student scenario. 
Through the lens of\nstatistical physics analysis, we characterize the macroscopic behavior of the\nmeta-training processes, the formation of the shared representation, and the\ngeneralization ability of the model on new tasks. The analysis also points to\nthe importance of the choice of certain hyperparameters of the learning\nalgorithms.\n","authors":["Hui Wang","Cho Tung Yip","Bo Li"],"pdf_url":"https://arxiv.org/pdf/2408.12545v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04315v1","updated":"2025-01-08T07:13:52Z","published":"2025-01-08T07:13:52Z","title":"RoRA: Efficient Fine-Tuning of LLM with Reliability Optimization for\n Rank Adaptation","summary":" Fine-tuning helps large language models (LLM) recover degraded information\nand enhance task performance. Although Low-Rank Adaptation (LoRA) is widely used\nand effective for fine-tuning, we have observed that its scaling factor can\nlimit or even reduce performance as the rank size increases. To address this\nissue, we propose RoRA (Rank-adaptive Reliability Optimization), a simple yet\neffective method for optimizing LoRA's scaling factor. By replacing $\\alpha/r$\nwith $\\alpha/\\sqrt{r}$, RoRA ensures improved performance as rank size\nincreases. Moreover, RoRA enhances low-rank adaptation in fine-tuning\nuncompressed models and excels in the more challenging task of accuracy\nrecovery when fine-tuning pruned models. Extensive experiments demonstrate the\neffectiveness of RoRA in fine-tuning both uncompressed and pruned models. RoRA\nsurpasses the state-of-the-art (SOTA) in average accuracy and robustness on\nLLaMA-7B/13B, LLaMA2-7B, and LLaMA3-8B, specifically outperforming LoRA and\nDoRA by 6.5% and 2.9% on LLaMA-7B, respectively. 
In pruned model fine-tuning,\nRoRA shows significant advantages; for SHEARED-LLAMA-1.3, a LLaMA-7B with 81.4%\npruning, RoRA achieves 5.7% higher average accuracy than LoRA and 3.9% higher\nthan DoRA.\n","authors":["Jun Liu","Zhenglun Kong","Peiyan Dong","Xuan Shen","Pu Zhao","Hao Tang","Geng Yuan","Wei Niu","Wenbin Zhang","Xue Lin","Dong Huang","Yanzhi Wang"],"pdf_url":"https://arxiv.org/pdf/2501.04315v1.pdf","comment":"ICASSP 2025"},{"id":"http://arxiv.org/abs/2501.04308v1","updated":"2025-01-08T06:53:21Z","published":"2025-01-08T06:53:21Z","title":"FSC-loss: A Frequency-domain Structure Consistency Learning Approach for\n Signal Data Recovery and Reconstruction","summary":" A core challenge for signal data recovery is to model the distribution of\nsignal matrix (SM) data based on measured low-quality data in biomedical\nengineering of magnetic particle imaging (MPI). For acquiring the\nhigh-resolution (high-quality) SM, the number of meticulous measurements at\nnumerous positions in the field-of-view proves time-consuming (measurement of a\n37x37x37 SM takes about 32 hours). To improve reconstructed signal quality and\nshorten SM measurement time, existing methods explore to generating\nhigh-resolution SM based on time-saving measured low-resolution SM (a 9x9x9 SM\njust takes about 0.5 hours). However, previous methods show poor performance\nfor high-frequency signal recovery in SM. To achieve a high-resolution SM\nrecovery and shorten its acquisition time, we propose a frequency-domain\nstructure consistency loss function and data component embedding strategy to\nmodel global and local structural information of SM. We adopt a\ntransformer-based network to evaluate this function and the strategy. We\nevaluate our methods and state-of-the-art (SOTA) methods on the two simulation\ndatasets and four public measured SMs in Open MPI Data. The results show that\nour method outperforms the SOTA methods in high-frequency structural signal\nrecovery. 
Additionally, our method can recover a high-resolution SM with clear\nhigh-frequency structure based on a down-sampling factor of 16 in less than 15\nseconds, which accelerates acquisition by over 60 times compared to the\nmeasurement-based HR SM with the minimum error (nRMSE=0.041). Moreover, our\nmethod is applied in our three in-house MPI systems, and boosts their\nperformance for signal reconstruction.\n","authors":["Liwen Zhang","Zhaoji Miao","Fan Yang","Gen Shi","Jie He","Yu An","Hui Hui","Jie Tian"],"pdf_url":"https://arxiv.org/pdf/2501.04308v1.pdf","comment":"11 pages,7 figures"},{"id":"http://arxiv.org/abs/2404.01714v4","updated":"2025-01-08T06:52:07Z","published":"2024-04-02T07:57:17Z","title":"Conjugate-Gradient-like Based Adaptive Moment Estimation Optimization\n Algorithm for Deep Learning","summary":" Training deep neural networks is a challenging task. In order to speed up\ntraining and enhance the performance of deep neural networks, we rectify the\nvanilla conjugate gradient as conjugate-gradient-like and incorporate it into\nthe generic Adam, and thus propose a new optimization algorithm named\nCG-like-Adam for deep learning. Specifically, both the first-order and the\nsecond-order moment estimation of generic Adam are replaced by the\nconjugate-gradient-like. Convergence analysis handles the cases where the\nexponential moving average coefficient of the first-order moment estimation is\nconstant and the first-order moment estimation is unbiased. 
Numerical\nexperiments show the superiority of the proposed algorithm based on the\nCIFAR10/100 dataset.\n","authors":["Jiawu Tian","Liwei Xu","Xiaowei Zhang","Yongqi Li"],"pdf_url":"https://arxiv.org/pdf/2404.01714v4.pdf","comment":"32 pages, 13 figures"},{"id":"http://arxiv.org/abs/2407.08974v2","updated":"2025-01-08T06:42:39Z","published":"2024-07-12T04:04:54Z","title":"Topology-enhanced machine learning model (Top-ML) for anticancer peptide\n prediction","summary":" Recently, therapeutic peptides have demonstrated great promise for cancer\ntreatment. To explore powerful anticancer peptides, artificial intelligence\n(AI)-based approaches have been developed to systematically screen potential\ncandidates. However, the lack of efficient featurization of peptides has become\na bottleneck for these machine-learning models. In this paper, we propose a\ntopology-enhanced machine learning model (Top-ML) for anticancer peptides\nprediction. Our Top-ML employs peptide topological features derived from its\nsequence \"connection\" information characterized by vector and spectral\ndescriptors. Our Top-ML model, employing an Extra-Trees classifier, has been\nvalidated on the AntiCP 2.0 and mACPpred 2.0 benchmark datasets, achieving\nstate-of-the-art performance or results comparable to existing deep learning\nmodels, while providing greater interpretability. 
Our results highlight the\npotential of leveraging novel topology-based featurization to accelerate the\nidentification of anticancer peptides.\n","authors":["Joshua Zhi En Tan","JunJie Wee","Xue Gong","Kelin Xia"],"pdf_url":"https://arxiv.org/pdf/2407.08974v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.08128v4","updated":"2025-01-08T06:35:45Z","published":"2024-12-11T06:31:06Z","title":"Why Does Dropping Edges Usually Outperform Adding Edges in Graph\n Contrastive Learning?","summary":" Graph contrastive learning (GCL) has been widely used as an effective\nself-supervised learning method for graph representation learning. However, how\nto apply adequate and stable graph augmentation to generating proper views for\ncontrastive learning remains an essential problem. Dropping edges is a primary\naugmentation in GCL while adding edges is not a common method due to its\nunstable performance. To our best knowledge, there is no theoretical analysis\nto study why dropping edges usually outperforms adding edges. To answer this\nquestion, we introduce a new metric, namely Error Passing Rate (EPR), to\nquantify how a graph fits the network. Inspired by the theoretical conclusions\nand the idea of positive-incentive noise, we propose a novel GCL algorithm,\nError-PAssing-based Graph Contrastive Learning (EPAGCL), which uses both edge\nadding and edge dropping as its augmentations. To be specific, we generate\nviews by adding and dropping edges based on the weights derived from EPR.\nExtensive experiments on various real-world datasets are conducted to validate\nthe correctness of our theoretical analysis and the effectiveness of our\nproposed algorithm. 
Our code is available at:\nhttps://github.com/hyzhang98/EPAGCL.\n","authors":["Yanchen Xu","Siqi Huang","Hongyuan Zhang","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2412.08128v4.pdf","comment":"Accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2501.04305v1","updated":"2025-01-08T06:34:32Z","published":"2025-01-08T06:34:32Z","title":"Physics-Informed Super-Resolution Diffusion for 6D Phase Space\n Diagnostics","summary":" Adaptive physics-informed super-resolution diffusion is developed for\nnon-invasive virtual diagnostics of the 6D phase space density of charged\nparticle beams. An adaptive variational autoencoder (VAE) embeds initial beam\ncondition images and scalar measurements to a low-dimensional latent space from\nwhich a 326 pixel 6D tensor representation of the beam's 6D phase space density\nis generated. Projecting from a 6D tensor generates physically consistent 2D\nprojections. Physics-guided super-resolution diffusion transforms\nlow-resolution images of the 6D density to high resolution 256x256 pixel\nimages. Un-supervised adaptive latent space tuning enables tracking of\ntime-varying beams without knowledge of time-varying initial conditions. The\nmethod is demonstrated with experimental data and multi-particle simulations at\nthe HiRES UED. The general approach is applicable to a wide range of complex\ndynamic systems evolving in high-dimensional phase space. The method is shown\nto be robust to distribution shift without re-training.\n","authors":["Alexander Scheinker"],"pdf_url":"https://arxiv.org/pdf/2501.04305v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04304v1","updated":"2025-01-08T06:30:31Z","published":"2025-01-08T06:30:31Z","title":"DGQ: Distribution-Aware Group Quantization for Text-to-Image Diffusion\n Models","summary":" Despite the widespread use of text-to-image diffusion models across various\ntasks, their computational and memory demands limit practical applications. 
To\nmitigate this issue, quantization of diffusion models has been explored. It\nreduces memory usage and computational costs by compressing weights and\nactivations into lower-bit formats. However, existing methods often struggle to\npreserve both image quality and text-image alignment, particularly in\nlower-bit($<$ 8bits) quantization. In this paper, we analyze the challenges\nassociated with quantizing text-to-image diffusion models from a distributional\nperspective. Our analysis reveals that activation outliers play a crucial role\nin determining image quality. Additionally, we identify distinctive patterns in\ncross-attention scores, which significantly affect text-image alignment. To\naddress these challenges, we propose Distribution-aware Group Quantization\n(DGQ), a method that identifies and adaptively handles pixel-wise and\nchannel-wise outliers to preserve image quality. Furthermore, DGQ applies\nprompt-specific logarithmic quantization scales to maintain text-image\nalignment. Our method demonstrates remarkable performance on datasets such as\nMS-COCO and PartiPrompts. 
We are the first to successfully achieve low-bit\nquantization of text-to-image diffusion models without requiring additional\nfine-tuning of weight quantization parameters.\n","authors":["Hyogon Ryu","NaHyeon Park","Hyunjung Shim"],"pdf_url":"https://arxiv.org/pdf/2501.04304v1.pdf","comment":"Project page: https://ugonfor.kr/DGQ"},{"id":"http://arxiv.org/abs/2501.04300v1","updated":"2025-01-08T06:18:32Z","published":"2025-01-08T06:18:32Z","title":"Handling Incomplete Heterogeneous Data using a Data-Dependent Kernel","summary":" Handling incomplete data in real-world applications is a critical challenge\ndue to two key limitations of existing methods: (i) they are primarily designed\nfor numeric data and struggle with categorical or heterogeneous/mixed datasets;\n(ii) they assume that data is missing completely at random, which is often not\nthe case in practice -- in reality, data is missing in patterns, leading to\nbiased results if these patterns are not accounted for. To address these two\nlimitations, this paper presents a novel approach to handling missing values\nusing the Probability Mass Similarity Kernel (PMK), a data-dependent kernel,\nwhich does not make any assumptions about data types and missing mechanisms. It\neliminates the need for prior knowledge or extensive pre-processing steps and\ninstead leverages the distribution of observed data. Our method unifies the\nrepresentation of diverse data types by capturing more meaningful pairwise\nsimilarities and enhancing downstream performance. We evaluated our approach\nacross over 10 datasets with numerical-only, categorical-only, and mixed\nfeatures under different missing mechanisms and rates. 
Across both\nclassification and clustering tasks, our approach consistently outperformed\nexisting techniques, demonstrating its robustness and effectiveness in managing\nincomplete heterogeneous data.\n","authors":["Youran Zhou","Mohamed Reda Bouadjenek","Jonathan Wells","Sunil Aryal"],"pdf_url":"https://arxiv.org/pdf/2501.04300v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04299v1","updated":"2025-01-08T06:07:33Z","published":"2025-01-08T06:07:33Z","title":"Circuit Complexity Bounds for Visual Autoregressive Model","summary":" Understanding the expressive ability of a specific model is essential for\ngrasping its capacity limitations. Recently, several studies have established\ncircuit complexity bounds for Transformer architecture. Besides, the Visual\nAutoRegressive (VAR) model has risen to be a prominent method in the field of\nimage generation, outperforming previous techniques, such as Diffusion\nTransformers, in generating high-quality images. We investigate the circuit\ncomplexity of the VAR model and establish a bound in this study. Our primary\nresult demonstrates that the VAR model is equivalent to a simulation by a\nuniform $\\mathsf{TC}^0$ threshold circuit with hidden dimension $d \\leq O(n)$\nand $\\mathrm{poly}(n)$ precision. This is the first study to rigorously\nhighlight the limitations in the expressive power of VAR models despite their\nimpressive performance. 
We believe our findings will offer valuable insights\ninto the inherent constraints of these models and guide the development of more\nefficient and expressive architectures in the future.\n","authors":["Yekun Ke","Xiaoyu Li","Yingyu Liang","Zhenmei Shi","Zhao Song"],"pdf_url":"https://arxiv.org/pdf/2501.04299v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17462v4","updated":"2025-01-08T05:36:30Z","published":"2024-05-23T07:20:45Z","title":"Ferrari: Federated Feature Unlearning via Optimizing Feature Sensitivity","summary":" The advent of Federated Learning (FL) highlights the practical necessity for\nthe right to be forgotten for all clients, allowing them to request data\ndeletion from the machine learning models service provider. This necessity has\nspurred a growing demand for Federated Unlearning (FU). Feature unlearning has\ngained considerable attention due to its applications in unlearning sensitive,\nbackdoor, and biased features. Existing methods employ the influence function\nto achieve feature unlearning, which is impractical for FL as it necessitates\nthe participation of other clients, if not all, in the unlearning process.\nFurthermore, current research lacks an evaluation of the effectiveness of\nfeature unlearning. To address these limitations, we define feature sensitivity\nin evaluating feature unlearning according to Lipschitz continuity. This metric\ncharacterizes the model outputs rate of change or sensitivity to perturbations\nin the input feature. We then propose an effective federated feature unlearning\nframework called Ferrari, which minimizes feature sensitivity. Extensive\nexperimental results and theoretical analysis demonstrate the effectiveness of\nFerrari across various feature unlearning scenarios, including sensitive,\nbackdoor, and biased features. 
The code is publicly available at\nhttps://github.com/OngWinKent/Federated-Feature-Unlearning\n","authors":["Hanlin Gu","Win Kent Ong","Chee Seng Chan","Lixin Fan"],"pdf_url":"https://arxiv.org/pdf/2405.17462v4.pdf","comment":"TLDR: The need for a \"right to be forgotten\" in Federated Learning\n has led to the development of the Ferrari framework, which efficiently\n unlearns sensitive features using a Lipschitz continuity-based metric, proven\n effective in extensive testing. Accepted at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2501.04292v1","updated":"2025-01-08T05:32:55Z","published":"2025-01-08T05:32:55Z","title":"MAD-UV: The 1st INTERSPEECH Mice Autism Detection via Ultrasound\n Vocalization Challenge","summary":" The Mice Autism Detection via Ultrasound Vocalization (MAD-UV) Challenge\nintroduces the first INTERSPEECH challenge focused on detecting autism spectrum\ndisorder (ASD) in mice through their vocalizations. Participants are tasked\nwith developing models to automatically classify mice as either wild-type or\nASD models based on recordings with a high sampling rate. Our baseline system\nemploys a simple CNN-based classification using three different spectrogram\nfeatures. Results demonstrate the feasibility of automated ASD detection, with\nthe considered audible-range features achieving the best performance (UAR of\n0.600 for segment-level and 0.625 for subject-level classification). This\nchallenge bridges speech technology and biomedical research, offering\nopportunities to advance our understanding of ASD models through machine\nlearning approaches. The findings suggest promising directions for vocalization\nanalysis and highlight the potential value of audible and ultrasound\nvocalizations in ASD detection.\n","authors":["Zijiang Yang","Meishu Song","Xin Jing","Haojie Zhang","Kun Qian","Bin Hu","Kota Tamada","Toru Takumi","Björn W. 
Schuller","Yoshiharu Yamamoto"],"pdf_url":"https://arxiv.org/pdf/2501.04292v1.pdf","comment":"5 pages, 1 figure and 2 tables. For MAD-UV Challenge 2025"},{"id":"http://arxiv.org/abs/2501.04288v1","updated":"2025-01-08T05:27:16Z","published":"2025-01-08T05:27:16Z","title":"An Analysis of Model Robustness across Concurrent Distribution Shifts","summary":" Machine learning models, meticulously optimized for source data, often fail\nto predict target data when faced with distribution shifts (DSs). Previous\nbenchmarking studies, though extensive, have mainly focused on simple DSs.\nRecognizing that DSs often occur in more complex forms in real-world scenarios,\nwe broadened our study to include multiple concurrent shifts, such as unseen\ndomain shifts combined with spurious correlations. We evaluated 26 algorithms\nthat range from simple heuristic augmentations to zero-shot inference using\nfoundation models, across 168 source-target pairs from eight datasets. Our\nanalysis of over 100K models reveals that (i) concurrent DSs typically worsen\nperformance compared to a single shift, with certain exceptions, (ii) if a\nmodel improves generalization for one distribution shift, it tends to be\neffective for others, and (iii) heuristic data augmentations achieve the best\noverall performance on both synthetic and real-world datasets.\n","authors":["Myeongho Jeon","Suhwan Choi","Hyoje Lee","Teresa Yeo"],"pdf_url":"https://arxiv.org/pdf/2501.04288v1.pdf","comment":"Accepted to TMLR"},{"id":"http://arxiv.org/abs/2501.04287v1","updated":"2025-01-08T05:25:14Z","published":"2025-01-08T05:25:14Z","title":"ElasticZO: A Memory-Efficient On-Device Learning with Combined Zeroth-\n and First-Order Optimization","summary":" Zeroth-order (ZO) optimization is being recognized as a simple yet powerful\nalternative to standard backpropagation (BP)-based training. 
Notably, ZO\noptimization allows for training with only forward passes and (almost) the same\nmemory as inference, making it well-suited for edge devices with limited\ncomputing and memory resources. In this paper, we propose ZO-based on-device\nlearning (ODL) methods for full-precision and 8-bit quantized deep neural\nnetworks (DNNs), namely ElasticZO and ElasticZO-INT8. ElasticZO lies in the\nmiddle between pure ZO- and pure BP-based approaches, and is based on the idea\nto employ BP for the last few layers and ZO for the remaining layers.\nElasticZO-INT8 achieves integer arithmetic-only ZO-based training for the first\ntime, by incorporating a novel method for computing quantized ZO gradients from\ninteger cross-entropy loss values. Experimental results on the classification\ndatasets show that ElasticZO effectively addresses the slow convergence of\nvanilla ZO and shrinks the accuracy gap to BP-based training. Compared to\nvanilla ZO, ElasticZO achieves 5.2-9.5% higher accuracy with only 0.072-1.7%\nmemory overhead, and can handle fine-tuning tasks as well as full training.\nElasticZO-INT8 further reduces the memory usage and training time by 1.46-1.60x\nand 1.38-1.42x without compromising the accuracy. These results demonstrate a\nbetter tradeoff between accuracy and training cost compared to pure ZO- and\nBP-based approaches, and also highlight the potential of ZO optimization in\non-device learning.\n","authors":["Keisuke Sugiura","Hiroki Matsutani"],"pdf_url":"https://arxiv.org/pdf/2501.04287v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04286v1","updated":"2025-01-08T05:24:11Z","published":"2025-01-08T05:24:11Z","title":"Mapping the Edge of Chaos: Fractal-Like Boundaries in The Trainability\n of Decoder-Only Transformer Models","summary":" In the realm of fractal geometry, intricate structures emerge from simple\niterative processes that partition parameter spaces into regions of stability\nand instability. 
Likewise, training large language models involves iteratively\napplying update functions, such as Adam, where even slight hyperparameter\nadjustments can shift the training process from convergence to divergence.\nRecent evidence from miniature neural networks suggests that the boundary\nseparating these outcomes displays fractal characteristics [1]. Building on\nthese insights, this study extends them to medium-sized, decoder-only\ntransformer architectures by employing a more consistent convergence measure\nand examining the learning rate hyperparameter landscape for attention and\nfully connected layers. The results show that the trainability frontier is not\na simple threshold; rather, it forms a self-similar yet seemingly random\nstructure at multiple scales, with statistically consistent and repeating\npatterns. Within this landscape, a region of stable convergence is surrounded\nby a complex chaotic border, illustrating the sensitive nature of the\nunderlying training dynamics.\n","authors":["Bahman Torkamandi"],"pdf_url":"https://arxiv.org/pdf/2501.04286v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2501.00309v2","updated":"2025-01-08T05:16:25Z","published":"2024-12-31T06:59:35Z","title":"Retrieval-Augmented Generation with Graphs (GraphRAG)","summary":" Retrieval-augmented generation (RAG) is a powerful technique that enhances\ndownstream task execution by retrieving additional information, such as\nknowledge, skills, and tools from external sources. Graph, by its intrinsic\n\"nodes connected by edges\" nature, encodes massive heterogeneous and relational\ninformation, making it a golden resource for RAG in tremendous real-world\napplications. As a result, we have recently witnessed increasing attention on\nequipping RAG with Graph, i.e., GraphRAG. 
However, unlike conventional RAG,\nwhere the retriever, generator, and external data sources can be uniformly\ndesigned in the neural-embedding space, the uniqueness of graph-structured\ndata, such as diverse-formatted and domain-specific relational knowledge, poses\nunique and significant challenges when designing GraphRAG for different\ndomains. Given the broad applicability, the associated design challenges, and\nthe recent surge in GraphRAG, a systematic and up-to-date survey of its key\nconcepts and techniques is urgently desired. Following this motivation, we\npresent a comprehensive and up-to-date survey on GraphRAG. Our survey first\nproposes a holistic GraphRAG framework by defining its key components,\nincluding query processor, retriever, organizer, generator, and data source.\nFurthermore, recognizing that graphs in different domains exhibit distinct\nrelational patterns and require dedicated designs, we review GraphRAG\ntechniques uniquely tailored to each domain. Finally, we discuss research\nchallenges and brainstorm directions to inspire cross-disciplinary\nopportunities. Our survey repository is publicly maintained at\nhttps://github.com/Graph-RAG/GraphRAG/.\n","authors":["Haoyu Han","Yu Wang","Harry Shomer","Kai Guo","Jiayuan Ding","Yongjia Lei","Mahantesh Halappanavar","Ryan A. Rossi","Subhabrata Mukherjee","Xianfeng Tang","Qi He","Zhigang Hua","Bo Long","Tong Zhao","Neil Shah","Amin Javari","Yinglong Xia","Jiliang Tang"],"pdf_url":"https://arxiv.org/pdf/2501.00309v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04281v1","updated":"2025-01-08T05:09:25Z","published":"2025-01-08T05:09:25Z","title":"Cluster & Disperse: a general air conflict resolution heuristic using\n unsupervised learning","summary":" We provide a general and malleable heuristic for the air conflict resolution\nproblem. This heuristic is based on a new neighborhood structure for searching\nthe solution space of trajectories and flight-levels. 
Using unsupervised\nlearning, the core idea of our heuristic is to cluster the conflict points and\ndisperse them in various flight levels. Our first algorithm is called Cluster &\nDisperse and in each iteration it assigns the most problematic flights in each\ncluster to another flight-level. In effect, we shuffle them between the\nflight-levels until we achieve a well-balanced configuration. The Cluster &\nDisperse algorithm then uses any horizontal plane conflict resolution algorithm\nas a subroutine to solve these well-balanced instances. Nevertheless, we\ndevelop a novel algorithm for the horizontal plane based on a similar idea.\nThat is we cluster and disperse the conflict points spatially in the same\nflight level using the gradient descent and a social force. We use a novel\nmaneuver making flights travel on an arc instead of a straight path which is\nbased on the aviation routine of the Radius to Fix legs. Our algorithms can\nhandle a high density of flights within a reasonable computation time. We put\ntheir performance in context with some notable algorithms from the literature.\nBeing a general framework, a particular strength of the Cluster & Disperse is\nits malleability in allowing various constraints regarding the aircraft or the\nenvironment to be integrated with ease. 
This is in contrast to the models for\ninstance based on mixed integer programming.\n","authors":["Mirmojtaba Gharibi","John-Paul Clarke"],"pdf_url":"https://arxiv.org/pdf/2501.04281v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12335v2","updated":"2025-01-08T04:53:52Z","published":"2024-03-19T00:48:25Z","title":"Temporally-Consistent Koopman Autoencoders for Forecasting Dynamical\n Systems","summary":" Absence of sufficiently high-quality data often poses a key challenge in\ndata-driven modeling of high-dimensional spatio-temporal dynamical systems.\nKoopman Autoencoders (KAEs) harness the expressivity of deep neural networks\n(DNNs), the dimension reduction capabilities of autoencoders, and the spectral\nproperties of the Koopman operator to learn a reduced-order feature space with\nsimpler, linear dynamics. However, the effectiveness of KAEs is hindered by\nlimited and noisy training datasets, leading to poor generalizability. To\naddress this, we introduce the Temporally-Consistent Koopman Autoencoder\n(tcKAE), designed to generate accurate long-term predictions even with limited\nand noisy training data. This is achieved through a consistency regularization\nterm that enforces prediction coherence across different time steps, thus\nenhancing the robustness and generalizability of tcKAE over existing models. 
We\nprovide analytical justification for this approach based on Koopman spectral\ntheory and empirically demonstrate tcKAE's superior performance over\nstate-of-the-art KAE models across a variety of test cases, including simple\npendulum oscillations, kinetic plasma, and fluid flow data.\n","authors":["Indranil Nayak","Ananda Chakrabarty","Mrinal Kumar","Fernando Teixeira","Debdipta Goswami"],"pdf_url":"https://arxiv.org/pdf/2403.12335v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03295v2","updated":"2025-01-08T04:50:01Z","published":"2025-01-06T11:43:29Z","title":"A Soft Sensor Method with Uncertainty-Awareness and Self-Explanation\n Based on Large Language Models Enhanced by Domain Knowledge Retrieval","summary":" Data-driven soft sensors are crucial in predicting key performance indicators\nin industrial systems. However, current methods predominantly rely on the\nsupervised learning paradigms of parameter updating, which inherently faces\nchallenges such as high development costs, poor robustness, training\ninstability, and lack of interpretability. Recently, large language models\n(LLMs) have demonstrated significant potential across various domains, notably\nthrough In-Context Learning (ICL), which enables high-performance task\nexecution with minimal input-label demonstrations and no prior training. This\npaper aims to replace supervised learning with the emerging ICL paradigm for\nsoft sensor modeling to address existing challenges and explore new avenues for\nadvancement. To achieve this, we propose a novel framework called the Few-shot\nUncertainty-aware and self-Explaining Soft Sensor (LLM-FUESS), which includes\nthe Zero-shot Auxiliary Variable Selector (LLM-ZAVS) and the Uncertainty-aware\nFew-shot Soft Sensor (LLM-UFSS). The LLM-ZAVS retrieves from the Industrial\nKnowledge Vector Storage to enhance LLMs' domain-specific knowledge, enabling\nzero-shot auxiliary variable selection. 
In the LLM-UFSS, we utilize text-based\ncontext demonstrations of structured data to prompt LLMs to execute ICL for\npredicting and propose a context sample retrieval augmentation strategy to\nimprove performance. Additionally, we explored LLMs' AIGC and probabilistic\ncharacteristics to propose self-explanation and uncertainty quantification\nmethods for constructing a trustworthy soft sensor. Extensive experiments\ndemonstrate that our method achieved state-of-the-art predictive performance,\nstrong robustness, and flexibility, and effectively mitigates training instability\nfound in traditional methods. To the best of our knowledge, this is the first\nwork to establish a soft sensor utilizing LLMs.\n","authors":["Shuo Tong","Han Liu","Runyuan Guo","Wenqing Wang","Xueqiong Tian","Lingyun Wei","Lin Zhang","Huayong Wu","Ding Liu","Youmin Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.03295v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04272v1","updated":"2025-01-08T04:44:47Z","published":"2025-01-08T04:44:47Z","title":"On weight and variance uncertainty in neural networks for regression\n tasks","summary":" We consider the problem of weight uncertainty proposed by [Blundell et al.\n(2015). Weight uncertainty in neural network. In International conference on\nmachine learning, 1613-1622, PMLR.] in neural networks {(NNs)} specialized for\nregression tasks. {We further} investigate the effect of variance uncertainty\nin {their model}. We show that including the variance uncertainty can improve\nthe prediction performance of the Bayesian {NN}. Variance uncertainty enhances\nthe generalization of the model {by} considering the posterior distribution\nover the variance parameter. { We examine the generalization ability of the\nproposed model using a function approximation} example and {further illustrate\nit with} the riboflavin genetic data set. 
{We explore fully connected dense\nnetworks and dropout NNs with} Gaussian and spike-and-slab priors,\nrespectively, for the network weights.\n","authors":["Moein Monemi","Morteza Amini","S. Mahmoud Taheri","Mohammad Arashi"],"pdf_url":"https://arxiv.org/pdf/2501.04272v1.pdf","comment":"Submitted to journal"}],"Multimedia":[{"id":"http://arxiv.org/abs/2501.04579v1","updated":"2025-01-08T15:48:30Z","published":"2025-01-08T15:48:30Z","title":"Unified Coding for Both Human Perception and Generalized Machine\n Analytics with CLIP Supervision","summary":" The image compression model has long struggled with adaptability and\ngeneralization, as the decoded bitstream typically serves only human or machine\nneeds and fails to preserve information for unseen visual tasks. Therefore,\nthis paper innovatively introduces supervision obtained from multimodal\npre-training models and incorporates adaptive multi-objective optimization\ntailored to support both human visual perception and machine vision\nsimultaneously with a single bitstream, denoted as Unified and Generalized\nImage Coding for Machine (UG-ICM). Specifically, to get rid of the reliance\nbetween compression models with downstream task supervision, we introduce\nContrastive Language-Image Pre-training (CLIP) models into the training\nconstraint for improved generalization. Global-to-instance-wise CLIP\nsupervision is applied to help obtain hierarchical semantics that make models\nmore generalizable for the tasks relying on the information of different\ngranularity. Furthermore, for supporting both human and machine visions with\nonly a unifying bitstream, we incorporate a conditional decoding strategy that\ntakes as conditions human or machine preferences, enabling the bitstream to be\ndecoded into different versions for corresponding preferences. As such, our\nproposed UG-ICM is fully trained in a self-supervised manner, i.e., without\nawareness of any specific downstream models and tasks. 
The extensive\nexperiments have shown that the proposed UG-ICM is capable of achieving\nremarkable improvements in various unseen machine analytics tasks, while\nsimultaneously providing perceptually satisfying images.\n","authors":["Kangsheng Yin","Quan Liu","Xuelin Shen","Yulin He","Wenhan Yang","Shiqi Wang"],"pdf_url":"https://arxiv.org/pdf/2501.04579v1.pdf","comment":"9 pages, 10 figures, published to AAAI 2025"},{"id":"http://arxiv.org/abs/2501.04511v1","updated":"2025-01-08T13:58:07Z","published":"2025-01-08T13:58:07Z","title":"Multichannel Steganography: A Provably Secure Hybrid Steganographic\n Model for Secure Communication","summary":" This study introduces a novel steganographic model that synthesizes\nSteganography by Cover Modification (CMO) and Steganography by Cover Synthesis\n(CSY), enhancing both security and undetectability by generating cover messages\nor parameters while retaining the original cover's form, thus minimizing\ndetection risks and overcoming the limitations of single-method techniques.\nBuilding upon this model, a refined Steganographic Communication Protocol is\nproposed, enhancing resilience against sophisticated threats such as\nMultichannel Replay Attacks and Multichannel Man-in-the-Middle Attacks,\nfortifying the protocol against potential tampering and improving upon prior\nworks. To evaluate the security of the proposed protocol, a novel adversarial\nmodel is developed simulating a probabilistic polynomial time (PPT) adversary\ncapable of intercepting communications across multiple channels. This model\nassesses the adversary's ability to compromise the protocol, providing a\ncomprehensive security analysis. Finally, this study explores the practicality\nand adaptability of the model to both constrained environments like SMS banking\nand resource-rich settings such as blockchain transactions, demonstrating their\npotential to enhance financial services and security. 
These contributions\npresent a robust and adaptable framework for secure steganographic\ncommunication, offering practical solutions for secure communications across\ndiverse environments.\n","authors":["Obinna Omego","Michal Bosy"],"pdf_url":"https://arxiv.org/pdf/2501.04511v1.pdf","comment":"18 pages, 8 figures, 3 algorithms, This version is a preprint\n uploaded to arXiv"},{"id":"http://arxiv.org/abs/2311.07594v3","updated":"2025-01-08T02:33:37Z","published":"2023-11-10T09:51:24Z","title":"How to Bridge the Gap between Modalities: Survey on Multimodal Large\n Language Model","summary":" We explore Multimodal Large Language Models (MLLMs), which integrate LLMs\nlike GPT-4 to handle multimodal data, including text, images, audio, and more.\nMLLMs demonstrate capabilities such as generating image captions and answering\nimage-based questions, bridging the gap towards real-world human-computer\ninteractions and hinting at a potential pathway to artificial general\nintelligence. However, MLLMs still face challenges in addressing the semantic\ngap in multimodal data, which may lead to erroneous outputs, posing potential\nrisks to society. Selecting the appropriate modality alignment method is\ncrucial, as improper methods might require more parameters without significant\nperformance improvements. This paper aims to explore modality alignment methods\nfor LLMs and their current capabilities. 
Implementing effective modality\nalignment can help LLMs address environmental issues and enhance accessibility.\nThe study surveys existing modality alignment methods for MLLMs, categorizing\nthem into four groups: (1) Multimodal Converter, which transforms data into a\nformat that LLMs can understand; (2) Multimodal Perceiver, which improves how\nLLMs perceive different types of data; (3) Tool Learning, which leverages\nexternal tools to convert data into a common format, usually text; and (4)\nData-Driven Method, which teaches LLMs to understand specific data types within\ndatasets.\n","authors":["Shezheng Song","Xiaopeng Li","Shasha Li","Shan Zhao","Jie Yu","Jun Ma","Xiaoguang Mao","Weimin Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.07594v3.pdf","comment":"Accepted by TKDE"},{"id":"http://arxiv.org/abs/2501.04204v1","updated":"2025-01-08T00:52:19Z","published":"2025-01-08T00:52:19Z","title":"LipGen: Viseme-Guided Lip Video Generation for Enhancing Visual Speech\n Recognition","summary":" Visual speech recognition (VSR), commonly known as lip reading, has garnered\nsignificant attention due to its wide-ranging practical applications. The\nadvent of deep learning techniques and advancements in hardware capabilities\nhave significantly enhanced the performance of lip reading models. Despite\nthese advancements, existing datasets predominantly feature stable video\nrecordings with limited variability in lip movements. This limitation results\nin models that are highly sensitive to variations encountered in real-world\nscenarios. To address this issue, we propose a novel framework, LipGen, which\naims to improve model robustness by leveraging speech-driven synthetic visual\ndata, thereby mitigating the constraints of current datasets. Additionally, we\nintroduce an auxiliary task that incorporates viseme classification alongside\nattention mechanisms. 
This approach facilitates the efficient integration of\ntemporal information, directing the model's focus toward the relevant segments\nof speech, thereby enhancing discriminative capabilities. Our method\ndemonstrates superior performance compared to the current state-of-the-art on\nthe lip reading in the wild (LRW) dataset and exhibits even more pronounced\nadvantages under challenging conditions.\n","authors":["Bowen Hao","Dongliang Zhou","Xiaojie Li","Xingyu Zhang","Liang Xie","Jianlong Wu","Erwei Yin"],"pdf_url":"https://arxiv.org/pdf/2501.04204v1.pdf","comment":"This paper has been accepted for presentation at ICASSP 2025"},{"id":"http://arxiv.org/abs/2404.05522v2","updated":"2025-01-08T22:34:12Z","published":"2024-04-08T13:43:19Z","title":"3DMambaIPF: A State Space Model for Iterative Point Cloud Filtering via\n Differentiable Rendering","summary":" Noise is an inevitable aspect of point cloud acquisition, necessitating\nfiltering as a fundamental task within the realm of 3D vision. Existing\nlearning-based filtering methods have shown promising capabilities on\nsmall-scale synthetic or real-world datasets. Nonetheless, the effectiveness of\nthese methods is constrained when dealing with a substantial quantity of point\nclouds. This limitation primarily stems from their limited denoising\ncapabilities for large-scale point clouds and their inclination to generate\nnoisy outliers after denoising. The recent introduction of State Space Models\n(SSMs) for long sequence modeling in Natural Language Processing (NLP) presents\na promising solution for handling large-scale data. Encouraged by iterative\npoint cloud filtering methods, we introduce 3DMambaIPF, firstly incorporating\nMamba (Selective SSM) architecture to sequentially handle extensive point\nclouds from large scenes, capitalizing on its strengths in selective input\nprocessing and long sequence modeling capabilities. 
Additionally, we integrate\na robust and fast differentiable rendering loss to constrain the noisy points\naround the surface. In contrast to previous methodologies, this differentiable\nrendering loss enhances the visual realism of denoised geometric structures and\naligns point cloud boundaries more closely with those observed in real-world\nobjects. Extensive evaluation on datasets comprising small-scale synthetic and\nreal-world models (typically with up to 50K points) demonstrate that our method\nachieves state-of-the-art results. Moreover, we showcase the superior\nscalability and efficiency of our method on large-scale models with about 500K\npoints, where the majority of the existing learning-based denoising methods are\nunable to handle.\n","authors":["Qingyuan Zhou","Weidong Yang","Ben Fei","Jingyi Xu","Rui Zhang","Keyi Liu","Yeqi Luo","Ying He"],"pdf_url":"https://arxiv.org/pdf/2404.05522v2.pdf","comment":"Accepted at AAAI-25"},{"id":"http://arxiv.org/abs/2501.04764v1","updated":"2025-01-08T18:35:48Z","published":"2025-01-08T18:35:48Z","title":"Video Summarisation with Incident and Context Information using\n Generative AI","summary":" The proliferation of video content production has led to vast amounts of\ndata, posing substantial challenges in terms of analysis efficiency and\nresource utilization. Addressing this issue calls for the development of robust\nvideo analysis tools. This paper proposes a novel approach leveraging\nGenerative Artificial Intelligence (GenAI) to facilitate streamlined video\nanalysis. Our tool aims to deliver tailored textual summaries of user-defined\nqueries, offering a focused insight amidst extensive video datasets. Unlike\nconventional frameworks that offer generic summaries or limited action\nrecognition, our method harnesses the power of GenAI to distil relevant\ninformation, enhancing analysis precision and efficiency. 
Employing YOLO-V8 for\nobject detection and Gemini for comprehensive video and text analysis, our\nsolution achieves heightened contextual accuracy. By combining YOLO with\nGemini, our approach furnishes textual summaries extracted from extensive CCTV\nfootage, enabling users to swiftly navigate and verify pertinent events without\nthe need for exhaustive manual review. The quantitative evaluation revealed a\nsimilarity of 72.8%, while the qualitative assessment rated an accuracy of 85%,\ndemonstrating the capability of the proposed method.\n","authors":["Ulindu De Silva","Leon Fernando","Kalinga Bandara","Rashmika Nawaratne"],"pdf_url":"https://arxiv.org/pdf/2501.04764v1.pdf","comment":null}],"Artificial Intelligence":[{"id":"http://arxiv.org/abs/2501.04700v1","updated":"2025-01-08T18:59:36Z","published":"2025-01-08T18:59:36Z","title":"Planarian Neural Networks: Evolutionary Patterns from Basic Bilateria\n Shaping Modern Artificial Neural Network Architectures","summary":" This study examined the viability of enhancing the prediction accuracy of\nartificial neural networks (ANNs) in image classification tasks by developing\nANNs with evolution patterns similar to those of biological neural networks.\nResNet is a widely used family of neural networks with both deep and wide\nvariants; therefore, it was selected as the base model for our investigation.\nThe aim of this study is to improve the image classification performance of\nANNs via a novel approach inspired by the biological nervous system\narchitecture of planarians, which comprises a brain and two nerve cords. We\nbelieve that the unique neural architecture of planarians offers valuable\ninsights into the performance enhancement of ANNs. The proposed planarian\nneural architecture-based neural network was evaluated on the CIFAR-10 and\nCIFAR-100 datasets. Our results indicate that the proposed method exhibits\nhigher prediction accuracy than the baseline neural network models in image\nclassification tasks. 
These findings demonstrate the significant potential of\nbiologically inspired neural network architectures in improving the performance\nof ANNs in a wide range of applications.\n","authors":["Ziyuan Huang","Mark Newman","Maria Vaida","Srikar Bellur","Roozbeh Sadeghian","Andrew Siu","Hui Wang","Kevin Huggins"],"pdf_url":"https://arxiv.org/pdf/2501.04700v1.pdf","comment":"11 pages, 9 figures"},{"id":"http://arxiv.org/abs/2501.04697v1","updated":"2025-01-08T18:58:48Z","published":"2025-01-08T18:58:48Z","title":"Grokking at the Edge of Numerical Stability","summary":" Grokking, the sudden generalization that occurs after prolonged overfitting,\nis a surprising phenomenon challenging our understanding of deep learning.\nAlthough significant progress has been made in understanding grokking, the\nreasons behind the delayed generalization and its dependence on regularization\nremain unclear. In this work, we argue that without regularization, grokking\ntasks push models to the edge of numerical stability, introducing floating\npoint errors in the Softmax function, which we refer to as Softmax Collapse\n(SC). We demonstrate that SC prevents grokking and that mitigating SC enables\ngrokking without regularization. Investigating the root cause of SC, we find\nthat beyond the point of overfitting, the gradients strongly align with what we\ncall the na\\\"ive loss minimization (NLM) direction. This component of the\ngradient does not alter the model's predictions but decreases the loss by\nscaling the logits, typically by scaling the weights along their current\ndirection. We show that this scaling of the logits explains the delay in\ngeneralization characteristic of grokking and eventually leads to SC, halting\nfurther learning. 
To validate our hypotheses, we introduce two key\ncontributions that address the challenges in grokking tasks: StableMax, a new\nactivation function that prevents SC and enables grokking without\nregularization, and $\\perp$Grad, a training algorithm that promotes quick\ngeneralization in grokking tasks by preventing NLM altogether. These\ncontributions provide new insights into grokking, elucidating its delayed\ngeneralization, reliance on regularization, and the effectiveness of existing\ngrokking-inducing methods. Code for this paper is available at\nhttps://github.com/LucasPrietoAl/grokking-at-the-edge-of-numerical-stability.\n","authors":["Lucas Prieto","Melih Barsbey","Pedro A. M. Mediano","Tolga Birdal"],"pdf_url":"https://arxiv.org/pdf/2501.04697v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04694v1","updated":"2025-01-08T18:58:15Z","published":"2025-01-08T18:58:15Z","title":"EpiCoder: Encompassing Diversity and Complexity in Code Generation","summary":" Effective instruction tuning is indispensable for optimizing code LLMs,\naligning model behavior with user expectations and enhancing model performance\nin real-world applications. However, most existing methods focus on code\nsnippets, which are limited to specific functionalities and rigid structures,\nrestricting the complexity and diversity of the synthesized data. To address\nthese limitations, we introduce a novel feature tree-based synthesis framework\ninspired by Abstract Syntax Trees (AST). Unlike AST, which captures syntactic\nstructure of code, our framework models semantic relationships between code\nelements, enabling the generation of more nuanced and diverse data. The feature\ntree is constructed from raw data and refined iteratively to increase the\nquantity and diversity of the extracted features. This process enables the\nidentification of more complex patterns and relationships within the code. 
By\nsampling subtrees with controlled depth and breadth, our framework allows\nprecise adjustments to the complexity of the generated code, supporting a wide\nrange of tasks from simple function-level operations to intricate multi-file\nscenarios. We fine-tuned widely-used base models to create the EpiCoder series,\nachieving state-of-the-art performance at both the function and file levels\nacross multiple benchmarks. Notably, empirical evidence indicates that our\napproach shows significant potential in synthesizing highly complex\nrepository-level code data. Further analysis elucidates the merits of this\napproach by rigorously assessing data complexity and diversity through software\nengineering principles and the LLM-as-a-judge method.\n","authors":["Yaoxiang Wang","Haoling Li","Xin Zhang","Jie Wu","Xiao Liu","Wenxiang Hu","Zhongxin Guo","Yangyu Huang","Ying Xin","Yujiu Yang","Jinsong Su","Qi Chen","Scarlett Li"],"pdf_url":"https://arxiv.org/pdf/2501.04694v1.pdf","comment":"40 pages, 11 figures"},{"id":"http://arxiv.org/abs/2501.04693v1","updated":"2025-01-08T18:57:33Z","published":"2025-01-08T18:57:33Z","title":"Beyond Sight: Finetuning Generalist Robot Policies with Heterogeneous\n Sensors via Language Grounding","summary":" Interacting with the world is a multi-sensory experience: achieving effective\ngeneral-purpose interaction requires making use of all available modalities --\nincluding vision, touch, and audio -- to fill in gaps from partial observation.\nFor example, when vision is occluded while reaching into a bag, a robot should rely\non its senses of touch and sound. However, state-of-the-art generalist robot\npolicies are typically trained on large datasets to predict robot actions\nsolely from visual and proprioceptive observations. 
In this work, we propose\nFuSe, a novel approach that enables finetuning visuomotor generalist policies\non heterogeneous sensor modalities for which large datasets are not readily\navailable by leveraging natural language as a common cross-modal grounding. We\ncombine a multimodal contrastive loss with a sensory-grounded language\ngeneration loss to encode high-level semantics. In the context of robot\nmanipulation, we show that FuSe enables performing challenging tasks that\nrequire reasoning jointly over modalities such as vision, touch, and sound in a\nzero-shot setting, such as multimodal prompting, compositional cross-modal\nprompting, and descriptions of objects it interacts with. We show that the same\nrecipe is applicable to widely different generalist policies, including both\ndiffusion-based generalist policies and large vision-language-action (VLA)\nmodels. Extensive experiments in the real world show that FuSe is able to\nincrease success rates by over 20% compared to all considered baselines.\n","authors":["Joshua Jones","Oier Mees","Carmelo Sferrazza","Kyle Stachowicz","Pieter Abbeel","Sergey Levine"],"pdf_url":"https://arxiv.org/pdf/2501.04693v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04686v1","updated":"2025-01-08T18:49:41Z","published":"2025-01-08T18:49:41Z","title":"URSA: Understanding and Verifying Chain-of-thought Reasoning in\n Multimodal Mathematics","summary":" Chain-of-thought (CoT) reasoning has been widely applied in the mathematical\nreasoning of Large Language Models (LLMs). Recently, the introduction of\nderivative process supervision on CoT trajectories has sparked discussions on\nenhancing scaling capabilities during test time, thereby boosting the potential\nof these models. However, in multimodal mathematical reasoning, the scarcity of\nhigh-quality CoT training data has hindered existing models from achieving\nhigh-precision CoT reasoning and has limited the realization of reasoning\npotential during test time. 
In this work, we propose a three-module synthesis\nstrategy that integrates CoT distillation, trajectory-format rewriting, and\nformat unification. It results in a high-quality CoT reasoning instruction\nfine-tuning dataset in multimodal mathematics, MMathCoT-1M. We comprehensively\nvalidate the state-of-the-art (SOTA) performance of the trained URSA-7B model\non multiple multimodal mathematical benchmarks. For test-time scaling, we\nintroduce a data synthesis strategy that automatically generates process\nannotation datasets, known as DualMath-1.1M, focusing on both interpretation\nand logic. By further training URSA-7B on DualMath-1.1M, we transition from CoT\nreasoning capabilities to robust supervision abilities. The trained URSA-RM-7B\nacts as a verifier, effectively enhancing the performance of URSA-7B at test\ntime. URSA-RM-7B also demonstrates excellent out-of-distribution (OOD)\nverifying capabilities, showcasing its generalization. Model weights, training\ndata and code will be open-sourced.\n","authors":["Ruilin Luo","Zhuofan Zheng","Yifan Wang","Yiyao Yu","Xinzhe Ni","Zicheng Lin","Jin Zeng","Yujiu Yang"],"pdf_url":"https://arxiv.org/pdf/2501.04686v1.pdf","comment":"27 pages, 10 tables, 17 figures. The training data has been released.\n The code and model are currently undergoing internal review. They will be\n made available soon. Project url: https://ursa-math.github.io"},{"id":"http://arxiv.org/abs/2501.04682v1","updated":"2025-01-08T18:42:48Z","published":"2025-01-08T18:42:48Z","title":"Towards System 2 Reasoning in LLMs: Learning How to Think With Meta\n Chain-of-Though","summary":" We propose a novel framework, Meta Chain-of-Thought (Meta-CoT), which extends\ntraditional Chain-of-Thought (CoT) by explicitly modeling the underlying\nreasoning required to arrive at a particular CoT. 
We present empirical evidence\nfrom state-of-the-art models exhibiting behaviors consistent with in-context\nsearch, and explore methods for producing Meta-CoT via process supervision,\nsynthetic data generation, and search algorithms. We then outline a\nconcrete pipeline for training a model to produce Meta-CoTs, incorporating\ninstruction tuning with linearized search traces and reinforcement learning\npost-training. Finally, we discuss open research questions, including scaling\nlaws, verifier roles, and the potential for discovering novel reasoning\nalgorithms. This work provides a theoretical and practical roadmap to enable\nMeta-CoT in LLMs, paving the way for more powerful and human-like reasoning in\nartificial intelligence.\n","authors":["Violet Xiang","Charlie Snell","Kanishk Gandhi","Alon Albalak","Anikait Singh","Chase Blagden","Duy Phung","Rafael Rafailov","Nathan Lile","Dakota Mahan","Louis Castricato","Jan-Philipp Franken","Nick Haber","Chelsea Finn"],"pdf_url":"https://arxiv.org/pdf/2501.04682v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04675v1","updated":"2025-01-08T18:33:17Z","published":"2025-01-08T18:33:17Z","title":"Enhancing Financial VQA in Vision Language Models using Intermediate\n Structured Representations","summary":" Chart interpretation is crucial for visual data analysis, but accurately\nextracting information from charts poses significant challenges for automated\nmodels. This study investigates the fine-tuning of DEPLOT, a modality\nconversion module that translates the image of a plot or chart to a linearized\ntable, on a custom dataset of 50,000 bar charts. The dataset comprises simple,\nstacked, and grouped bar charts, targeting the unique structural features of\nthese visualizations. 
The finetuned DEPLOT model is evaluated against its base\nversion using a test set of 1,000 images and two metrics: Relative Mapping\nSimilarity (RMS), which measures categorical mapping accuracy, and Relative\nNumber Set Similarity (RNSS), which evaluates numerical interpretation\naccuracy. To further explore the reasoning capabilities of large language\nmodels (LLMs), we curate an additional set of 100 bar chart images paired with\nquestion answer sets. Our findings demonstrate that providing a structured\nintermediate table alongside the image significantly enhances LLM reasoning\nperformance compared to direct image queries.\n","authors":["Archita Srivastava","Abhas Kumar","Rajesh Kumar","Prabhakar Srinivasan"],"pdf_url":"https://arxiv.org/pdf/2501.04675v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.02788v2","updated":"2025-01-08T18:33:07Z","published":"2025-01-06T06:07:40Z","title":"GLoG-CSUnet: Enhancing Vision Transformers with Adaptable Radiomic\n Features for Medical Image Segmentation","summary":" Vision Transformers (ViTs) have shown promise in medical image semantic\nsegmentation (MISS) by capturing long-range correlations. However, ViTs often\nstruggle to model local spatial information effectively, which is essential for\naccurately segmenting fine anatomical details, particularly when applied to\nsmall datasets without extensive pre-training. We introduce Gabor and Laplacian\nof Gaussian Convolutional Swin Network (GLoG-CSUnet), a novel architecture\nenhancing Transformer-based models by incorporating learnable radiomic\nfeatures. This approach integrates dynamically adaptive Gabor and Laplacian of\nGaussian (LoG) filters to capture texture, edge, and boundary information,\nenhancing the feature representation processed by the Transformer model. Our\nmethod uniquely combines the long-range dependency modeling of Transformers\nwith the texture analysis capabilities of Gabor and LoG features. 
Evaluated on\nthe Synapse multi-organ and ACDC cardiac segmentation datasets, GLoG-CSUnet\ndemonstrates significant improvements over state-of-the-art models, achieving a\n1.14% increase in Dice score for Synapse and 0.99% for ACDC, with minimal\ncomputational overhead (only 15 and 30 additional parameters, respectively).\nGLoG-CSUnet's flexible design allows integration with various base models,\noffering a promising approach for incorporating radiomics-inspired feature\nextraction in Transformer architectures for medical image analysis. The code\nimplementation is available on GitHub at: https://github.com/HAAIL/GLoG-CSUnet.\n","authors":["Niloufar Eghbali","Hassan Bagher-Ebadian","Tuka Alhanai","Mohammad M. Ghassemi"],"pdf_url":"https://arxiv.org/pdf/2501.02788v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04671v1","updated":"2025-01-08T18:31:16Z","published":"2025-01-08T18:31:16Z","title":"DRIVINGVQA: Analyzing Visual Chain-of-Thought Reasoning of Vision\n Language Models in Real-World Scenarios with Driving Theory Tests","summary":" Large vision-language models (LVLMs) augment language models with visual\nunderstanding, enabling multimodal reasoning. However, due to the modality gap\nbetween textual and visual data, they often face significant challenges, such\nas over-reliance on text priors, hallucinations, and limited capacity for\ncomplex visual reasoning. Existing benchmarks to evaluate visual reasoning in\nLVLMs often rely on schematic or synthetic images and on imprecise\nmachine-generated explanations. To bridge the modality gap, we present\nDrivingVQA, a new benchmark derived from driving theory tests to evaluate\nvisual chain-of-thought reasoning in complex real-world scenarios. It offers\n3,931 expert-crafted multiple-choice problems and interleaved explanations\ngrounded with entities relevant to the reasoning process. We leverage this\ndataset to perform an extensive study of LVLMs' ability to reason about complex\nvisual scenarios. 
Our experiments reveal that open-source and proprietary LVLMs\nstruggle with visual chain-of-thought reasoning under zero-shot settings. We\ninvestigate training strategies that leverage relevant entities to improve\nvisual reasoning. Notably, we observe a performance boost of up to 7\\% when\nreasoning over image tokens of cropped regions tied to these entities.\n","authors":["Charles Corbière","Simon Roburin","Syrielle Montariol","Antoine Bosselut","Alexandre Alahi"],"pdf_url":"https://arxiv.org/pdf/2501.04671v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01348v2","updated":"2025-01-08T18:20:46Z","published":"2024-12-02T10:19:36Z","title":"Hierarchical Object-Oriented POMDP Planning for Object Rearrangement","summary":" We present an online planning framework for solving multi-object\nrearrangement problems in partially observable, multi-room environments.\nCurrent object rearrangement solutions, primarily based on Reinforcement\nLearning or hand-coded planning methods, often lack adaptability to diverse\nchallenges. To address this limitation, we introduce a novel Hierarchical\nObject-Oriented Partially Observed Markov Decision Process (HOO-POMDP) planning\napproach. This approach comprises (a) an object-oriented POMDP planner\ngenerating sub-goals, (b) a set of low-level policies for sub-goal achievement,\nand (c) an abstraction system converting the continuous low-level world into a\nrepresentation suitable for abstract planning. We evaluate our system on\nvarying numbers of objects, rooms, and problem types in AI2-THOR simulated\nenvironments with promising results.\n","authors":["Rajesh Mangannavar","Alan Fern","Prasad Tadepalli"],"pdf_url":"https://arxiv.org/pdf/2412.01348v2.pdf","comment":"17 pages, 2 Figures. Preprint. 
Updated acknowledgments"},{"id":"http://arxiv.org/abs/2501.04661v1","updated":"2025-01-08T18:15:10Z","published":"2025-01-08T18:15:10Z","title":"Assessing Language Comprehension in Large Language Models Using\n Construction Grammar","summary":" Large Language Models, despite their significant capabilities, are known to\nfail in surprising and unpredictable ways. Evaluating their true\n`understanding' of language is particularly challenging due to the extensive\nweb-scale data they are trained on. Therefore, we construct an evaluation to\nsystematically assess natural language understanding (NLU) in LLMs by\nleveraging Construction Grammar (CxG), which provides insights into the meaning\ncaptured by linguistic elements known as constructions (Cxns). CxG is\nwell-suited for this purpose because it provides a theoretical basis to construct\ntargeted evaluation sets. These datasets are carefully constructed to include\nexamples which are unlikely to appear in pre-training data, yet intuitive and\neasy for humans to understand, enabling a more targeted and reliable\nassessment. Our experiments focus on downstream natural language inference and\nreasoning tasks by comparing LLMs' understanding of the underlying meanings\ncommunicated through 8 unique Cxns with that of humans. The results show that\nwhile LLMs demonstrate some knowledge of constructional information, even the\nlatest models including GPT-o1 struggle with abstract meanings conveyed by\nthese Cxns, as demonstrated in cases where test sentences are dissimilar to\ntheir pre-training data. We argue that such cases provide a more accurate test\nof true language understanding, highlighting key limitations in LLMs' semantic\ncapabilities. 
We make our novel dataset and associated experimental data,\nincluding prompts and model responses, publicly available.\n","authors":["Wesley Scivetti","Melissa Torgbi","Austin Blodgett","Mollie Shichman","Taylor Hudson","Claire Bonial","Harish Tayyar Madabushi"],"pdf_url":"https://arxiv.org/pdf/2501.04661v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.02832v3","updated":"2025-01-08T17:46:40Z","published":"2025-01-06T08:16:06Z","title":"Samba-ASR: State-Of-The-Art Speech Recognition Leveraging Structured\n State-Space Models","summary":" We propose Samba ASR, the first state-of-the-art Automatic Speech\nRecognition (ASR) model leveraging the novel Mamba architecture as both encoder\nand decoder, built on the foundation of state space models (SSMs). Unlike\ntransformer-based ASR models, which rely on self-attention mechanisms to\ncapture dependencies, Samba ASR effectively models both local and global\ntemporal dependencies using efficient state-space dynamics, achieving\nremarkable performance gains. By addressing the limitations of transformers,\nsuch as quadratic scaling with input length and difficulty in handling\nlong-range dependencies, Samba ASR achieves superior accuracy and efficiency.\nExperimental results demonstrate that Samba ASR surpasses existing open-source\ntransformer-based ASR models across various standard benchmarks, establishing\nit as the new state of the art in ASR. Extensive evaluations on the benchmark\ndataset show significant improvements in Word Error Rate (WER), with\ncompetitive performance even in low-resource scenarios. Furthermore, the\ninherent computational efficiency and parameter optimization of the Mamba\narchitecture make Samba ASR a scalable and robust solution for diverse ASR\ntasks. Our contributions include the development of a new Samba ASR\narchitecture for automatic speech recognition (ASR), demonstrating the\nsuperiority of structured state-space models (SSMs) over transformer-based\nmodels for speech sequence processing. We provide a
comprehensive evaluation on public\nbenchmarks, showcasing state-of-the-art (SOTA) performance, and present an\nin-depth analysis of computational efficiency, robustness to noise, and\nsequence generalization. This work highlights the viability of Mamba SSMs as a\ntransformer-free alternative for efficient and accurate ASR. By leveraging the\nadvancements of state-space modeling, Samba ASR redefines ASR performance\nstandards and sets a new benchmark for future research in this field.\n","authors":["Syed Abdul Gaffar Shakhadri","Kruthika KR","Kartik Basavaraj Angadi"],"pdf_url":"https://arxiv.org/pdf/2501.02832v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15861v2","updated":"2025-01-08T17:41:51Z","published":"2024-09-24T08:33:41Z","title":"A Zero-Shot Open-Vocabulary Pipeline for Dialogue Understanding","summary":" Dialogue State Tracking (DST) is crucial for understanding user needs and\nexecuting appropriate system actions in task-oriented dialogues. The majority\nof existing DST methods are designed to work within predefined ontologies and\nassume the availability of gold domain labels, struggling to adapt to new\nslot values. While Large Language Model (LLM)-based systems show promising\nzero-shot DST performance, they either require extensive computational\nresources or they underperform existing fully-trained systems, limiting their\npracticality. To address these limitations, we propose a zero-shot,\nopen-vocabulary system that integrates domain classification and DST in a\nsingle pipeline. Our approach includes reformulating DST as a\nquestion-answering task for less capable models and employing self-refining\nprompts for more adaptable ones. Our system does not rely on fixed slot values\ndefined in the ontology, allowing the system to adapt dynamically. 
We compare\nour approach with existing SOTA, and show that it provides up to 20% better\nJoint Goal Accuracy (JGA) over previous methods on datasets like Multi-WOZ 2.1,\nwith up to 90% fewer requests to the LLM API.\n","authors":["Abdulfattah Safa","Gözde Gül Şahin"],"pdf_url":"https://arxiv.org/pdf/2409.15861v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04635v1","updated":"2025-01-08T17:29:46Z","published":"2025-01-08T17:29:46Z","title":"Knowledge Retrieval Based on Generative AI","summary":" This study develops a question-answering system based on Retrieval-Augmented\nGeneration (RAG) using Chinese Wikipedia and Lawbank as retrieval sources.\nUsing TTQA and TMMLU+ as evaluation datasets, the system employs BGE-M3 for\ndense vector retrieval to obtain highly relevant search results and\nBGE-reranker to reorder these results based on query relevance. The most\npertinent retrieval outcomes serve as reference knowledge for a Large Language\nModel (LLM), enhancing its ability to answer questions and establishing a\nknowledge retrieval system grounded in generative AI.\n The system's effectiveness is assessed through a two-stage evaluation:\nautomatic and assisted performance evaluations. The automatic evaluation\ncalculates accuracy by comparing the model's auto-generated labels with ground\ntruth answers, measuring performance under standardized conditions without\nhuman intervention. The assisted performance evaluation involves 20\nfinance-related multiple-choice questions answered by 20 participants without\nfinancial backgrounds. Initially, participants answer independently. 
Later,\nthey receive system-generated reference information to assist in answering,\nexamining whether the system improves accuracy when assistance is provided.\n The main contributions of this research are: (1) Enhanced LLM Capability: By\nintegrating BGE-M3 and BGE-reranker, the system retrieves and reorders highly\nrelevant results, reduces hallucinations, and dynamically accesses authorized\nor public knowledge sources. (2) Improved Data Privacy: A customized RAG\narchitecture enables local operation of the LLM, eliminating the need to send\nprivate data to external servers. This approach enhances data security, reduces\nreliance on commercial services, lowers operational costs, and mitigates\nprivacy risks.\n","authors":["Te-Lun Yang","Jyi-Shane Liu","Yuen-Hsien Tseng","Jyh-Shing Roger Jang"],"pdf_url":"https://arxiv.org/pdf/2501.04635v1.pdf","comment":"8 pages, 13 figures, 1 table"},{"id":"http://arxiv.org/abs/2501.04614v1","updated":"2025-01-08T16:53:56Z","published":"2025-01-08T16:53:56Z","title":"MedCoDi-M: A Multi-Prompt Foundation Model for Multimodal Medical Data\n Generation","summary":" Artificial Intelligence is revolutionizing medical practice, enhancing\ndiagnostic accuracy and healthcare delivery. However, its adaptation in medical\nsettings still faces significant challenges, related to data availability and\nprivacy constraints. Synthetic data has emerged as a promising solution to\nmitigate these issues, addressing data scarcity while preserving privacy.\nRecently, Latent Diffusion Models have emerged as a powerful tool for\ngenerating high-quality synthetic data. Meanwhile, the integration of different\nmodalities has gained interest, emphasizing the need for models capable of\nhandling multimodal medical data. Existing approaches struggle to integrate\ncomplementary information and lack the ability to generate modalities\nsimultaneously. 
To address this challenge, we present MedCoDi-M, a\n6.77-billion-parameter model, designed for multimodal medical data generation,\nthat, following the Foundation Model paradigm, exploits contrastive learning and\na large quantity of data to build a shared latent space which captures the\nrelationships between different data modalities. Further, we introduce the\nMulti-Prompt training technique, which significantly boosts MedCoDi-M's\ngeneration under different settings. We extensively validate MedCoDi-M: first,\nwe benchmark it against five competitors on the MIMIC-CXR dataset, a\nstate-of-the-art dataset for Chest X-ray and radiological report generation.\nSecondly, we perform a Visual Turing Test with expert radiologists to assess\nthe realism and clinical relevance of the generated data, ensuring alignment\nwith real-world scenarios. Finally, we assess the utility of MedCoDi-M in\naddressing key challenges in the medical field, such as anonymization, data\nscarcity and imbalanced learning. The results are promising, demonstrating the\napplicability of MedCoDi-M in medical contexts. Project page is at\nhttps://cosbidev.github.io/MedCoDi-M/.\n","authors":["Daniele Molino","Francesco Di Feola","Eliodoro Faiella","Deborah Fazzini","Domiziana Santucci","Linlin Shen","Valerio Guarrasi","Paolo Soda"],"pdf_url":"https://arxiv.org/pdf/2501.04614v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.15856v2","updated":"2025-01-08T16:31:06Z","published":"2024-01-29T03:07:04Z","title":"The Indoor-Training Effect: unexpected gains from distribution shifts in\n the transition function","summary":" Is it better to perform tennis training in a pristine indoor environment or a\nnoisy outdoor one? To model this problem, here we investigate whether shifts in\nthe transition probabilities between the training and testing environments in\nreinforcement learning problems can lead to better performance under certain\nconditions. 
We generate new Markov Decision Processes (MDPs) starting from a\ngiven MDP, by adding quantifiable, parametric noise into the transition\nfunction. We refer to this process as Noise Injection and the resulting\nenvironments as {\\delta}-environments. This process allows us to create\nvariations of the same environment with quantitative control over noise serving\nas a metric of distance between environments. Conventional wisdom suggests that\ntraining and testing on the same MDP should yield the best results. In stark\ncontrast, we observe that agents can perform better when trained on the\nnoise-free environment and tested on the noisy {\\delta}-environments, compared\nto training and testing on the same {\\delta}-environments. We confirm that this\nfinding extends beyond noise variations: it is possible to showcase the same\nphenomenon in ATARI game variations including varying Ghost behaviour in\nPacMan, and Paddle behaviour in Pong. We demonstrate this intriguing behaviour\nacross 60 different variations of ATARI games, including PacMan, Pong, and\nBreakout. We refer to this phenomenon as the Indoor-Training Effect. Code to\nreproduce our experiments and to implement Noise Injection can be found at\nhttps://bit.ly/3X6CTYk.\n","authors":["Serena Bono","Spandan Madan","Ishaan Grover","Mao Yasueda","Cynthia Breazeal","Hanspeter Pfister","Gabriel Kreiman"],"pdf_url":"https://arxiv.org/pdf/2401.15856v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.06855v3","updated":"2025-01-08T16:26:44Z","published":"2024-12-08T20:23:48Z","title":"Incentivized Symbiosis: A Paradigm for Human-Agent Coevolution","summary":" Cooperation is vital to our survival and progress. Evolutionary game theory\noffers a lens to understand the structures and incentives that enable\ncooperation to be a successful strategy. As artificial intelligence agents\nbecome integral to human systems, the dynamics of cooperation take on\nunprecedented significance. 
The convergence of human-agent teaming, contract\ntheory, and decentralized frameworks like Web3, grounded in transparency,\naccountability, and trust, offers a foundation for fostering cooperation by\nestablishing enforceable rules and incentives for humans and AI agents. We\nconceptualize Incentivized Symbiosis as a social contract between humans and\nAI, inspired by Web3 principles and encoded in blockchain technology, to define\nand enforce rules, incentives, and consequences for both parties. By exploring\nthis paradigm, we aim to catalyze new research at the intersection of systems\nthinking in AI, Web3, and society, fostering innovative pathways for\ncooperative human-agent coevolution.\n","authors":["Tomer Jordi Chaffer","Justin Goldston","Gemach D. A. T. A. I"],"pdf_url":"https://arxiv.org/pdf/2412.06855v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04588v1","updated":"2025-01-08T16:06:39Z","published":"2025-01-08T16:06:39Z","title":"Federated-Continual Dynamic Segmentation of Histopathology guided by\n Barlow Continuity","summary":" Federated- and Continual Learning have been established as approaches to\nenable privacy-aware learning on continuously changing data, as required for\ndeploying AI systems in histopathology images. However, data shifts can occur\nin a dynamic world, spatially between institutions and temporally, due to\nchanging data over time. This leads to two issues: Client Drift, where the\ncentral model degrades from aggregating data from clients trained on shifted\ndata, and Catastrophic Forgetting, from temporal shifts such as changes in\npatient populations. Both tend to degrade the model's performance on previously\nseen data or on spatially distributed training. Despite both problems arising from\nthe same underlying cause, data shifts, existing research addresses them\nonly individually. 
In this work, we introduce a method that can jointly\nalleviate Client Drift and Catastrophic Forgetting by using our proposed\nDynamic Barlow Continuity, which evaluates client updates on a public reference\ndataset and uses this to guide the training process to a spatially and\ntemporally shift-invariant model. We evaluate our approach on the\nhistopathology datasets BCSS and Semicol and show our method to be highly\neffective, jointly improving the Dice score from 15.8% to 71.6% under\nClient Drift and from 42.5% to 62.8% under Catastrophic Forgetting. This enables\nDynamic Learning by establishing spatio-temporal shift-invariance.\n","authors":["Niklas Babendererde","Haozhe Zhu","Moritz Fuchs","Jonathan Stieber","Anirban Mukhopadhyay"],"pdf_url":"https://arxiv.org/pdf/2501.04588v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04577v1","updated":"2025-01-08T15:47:04Z","published":"2025-01-08T15:47:04Z","title":"A 65 nm Bayesian Neural Network Accelerator with 360 fJ/Sample In-Word\n GRNG for AI Uncertainty Estimation","summary":" Uncertainty estimation is an indispensable capability for AI-enabled,\nsafety-critical applications, e.g. autonomous vehicles or medical diagnosis.\nBayesian neural networks (BNNs) use Bayesian statistics to provide both\nclassification predictions and uncertainty estimation, but they suffer from\nhigh computational overhead associated with random number generation and\nrepeated sample iterations. Furthermore, BNNs are not immediately amenable to\nacceleration through compute-in-memory architectures due to the frequent memory\nwrites necessary after each RNG operation. To address these challenges, we\npresent an ASIC that integrates 360 fJ/Sample Gaussian RNG directly into the\nSRAM memory words. This integration reduces RNG overhead and enables\nfully-parallel compute-in-memory operations for BNNs. 
The prototype chip\nachieves 5.12 GSa/s RNG throughput and 102 GOp/s neural network throughput\nwhile occupying 0.45 mm2, bringing AI uncertainty estimation to edge\ncomputation.\n","authors":["Zephan M. Enciso","Boyang Cheng","Likai Pei","Jianbo Liu","Steven Davis","Ningyuan Cao","Michael Niemier"],"pdf_url":"https://arxiv.org/pdf/2501.04577v1.pdf","comment":"7 pages, 12 figures"},{"id":"http://arxiv.org/abs/2501.04575v1","updated":"2025-01-08T15:45:21Z","published":"2025-01-08T15:45:21Z","title":"InfiGUIAgent: A Multimodal Generalist GUI Agent with Native Reasoning\n and Reflection","summary":" Graphical User Interface (GUI) Agents, powered by multimodal large language\nmodels (MLLMs), have shown great potential for task automation on computing\ndevices such as computers and mobile phones. However, existing agents face\nchallenges in multi-step reasoning and reliance on textual annotations,\nlimiting their effectiveness. We introduce \\textit{InfiGUIAgent}, an MLLM-based\nGUI Agent trained with a two-stage supervised fine-tuning pipeline. Stage 1\nenhances fundamental skills such as GUI understanding and grounding, while\nStage 2 integrates hierarchical reasoning and expectation-reflection reasoning\nskills using synthesized data to enable native reasoning abilities of the\nagents. \\textit{InfiGUIAgent} achieves competitive performance on several GUI\nbenchmarks, highlighting the impact of native reasoning skills in enhancing GUI\ninteraction for automation tasks. 
Resources are available at\n\\url{https://github.com/Reallm-Labs/InfiGUIAgent}.\n","authors":["Yuhang Liu","Pengxiang Li","Zishu Wei","Congkai Xie","Xueyu Hu","Xinchen Xu","Shengyu Zhang","Xiaotian Han","Hongxia Yang","Fei Wu"],"pdf_url":"https://arxiv.org/pdf/2501.04575v1.pdf","comment":"14 pages, 7 figures, work in progress"},{"id":"http://arxiv.org/abs/2409.10589v2","updated":"2025-01-08T15:41:04Z","published":"2024-09-16T15:18:10Z","title":"Offline Reinforcement Learning for Learning to Dispatch for Job Shop\n Scheduling","summary":" The Job Shop Scheduling Problem (JSSP) is a complex combinatorial\noptimization problem. While online Reinforcement Learning (RL) has shown\npromise by quickly finding acceptable solutions for JSSP, it faces key\nlimitations: it requires extensive training interactions from scratch leading\nto sample inefficiency, cannot leverage existing high-quality solutions, and\noften yields suboptimal results compared to traditional methods like Constraint\nProgramming (CP). We introduce Offline Reinforcement Learning for Learning to\nDispatch (Offline-LD), which addresses these limitations by learning from\npreviously generated solutions. Our approach is motivated by scenarios where\nhistorical scheduling data and expert solutions are available, although our\ncurrent evaluation focuses on benchmark problems. Offline-LD adapts two\nCQL-based Q-learning methods (mQRDQN and discrete mSAC) for maskable action\nspaces, introduces a novel entropy bonus modification for discrete SAC, and\nexploits reward normalization through preprocessing. Our experiments\ndemonstrate that Offline-LD outperforms online RL on both generated and\nbenchmark instances. 
Notably, by introducing noise into the expert dataset, we\nachieve similar or better results than those obtained from the expert dataset,\nsuggesting that a more diverse training set is preferable because it contains\ncounterfactual information.\n","authors":["Jesse van Remmerden","Zaharah Bukhsh","Yingqian Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.10589v2.pdf","comment":"Code available at https://github.com/jesserem/Offline-LD"},{"id":"http://arxiv.org/abs/2501.04568v1","updated":"2025-01-08T15:32:12Z","published":"2025-01-08T15:32:12Z","title":"Supervision-free Vision-Language Alignment","summary":" Vision-language models (VLMs) have demonstrated remarkable potential in\nintegrating visual and linguistic information, but their performance is often\nconstrained by the need for extensive, high-quality image-text training data.\nCuration of these image-text pairs is both time-consuming and computationally\nexpensive. To address this challenge, we introduce SVP (Supervision-free Visual\nProjection), a novel framework that enhances vision-language alignment without\nrelying on curated data or preference annotation. SVP leverages self-captioning\nand a pre-trained grounding model as a feedback mechanism to elicit latent\ninformation in VLMs. We evaluate our approach across six key areas: captioning,\nreferring, visual question answering, multitasking, hallucination control, and\nobject recall. Results demonstrate significant improvements, including a 14%\naverage improvement in captioning tasks, up to 12% increase in object recall,\nand substantial reduction in hallucination rates. 
Notably, a small VLM using\nSVP achieves hallucination reductions comparable to a model five times larger,\nwhile a VLM with initially poor referring capabilities more than doubles its\nperformance, approaching parity with a model twice its size.\n","authors":["Giorgio Giannone","Ruoteng Li","Qianli Feng","Evgeny Perevodchikov","Rui Chen","Aleix Martinez"],"pdf_url":"https://arxiv.org/pdf/2501.04568v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2406.06184v2","updated":"2025-01-08T15:28:11Z","published":"2024-06-10T11:28:25Z","title":"Deep Multi-Objective Reinforcement Learning for Utility-Based\n Infrastructural Maintenance Optimization","summary":" In this paper, we introduce Multi-Objective Deep Centralized Multi-Agent\nActor-Critic (MO-DCMAC), a multi-objective reinforcement learning (MORL)\nmethod for infrastructural maintenance optimization, an area traditionally\ndominated by single-objective reinforcement learning (RL) approaches. Previous\nsingle-objective RL methods combine multiple objectives, such as probability of\ncollapse and cost, into a singular reward signal through reward-shaping. In\ncontrast, MO-DCMAC can optimize a policy for multiple objectives directly, even\nwhen the utility function is non-linear. We evaluated MO-DCMAC using two\nutility functions, which use probability of collapse and cost as input. The\nfirst utility function is the Threshold utility, in which MO-DCMAC should\nminimize cost so that the probability of collapse is never above the threshold.\nThe second is based on the Failure Mode, Effects, and Criticality Analysis\n(FMECA) methodology used by asset managers to assess maintenance plans. We\nevaluated MO-DCMAC, with both utility functions, in multiple maintenance\nenvironments, including ones based on a case study of the historical quay walls\nof Amsterdam. The performance of MO-DCMAC was compared against multiple\nrule-based policies based on heuristics currently used for constructing\nmaintenance plans. 
Our results demonstrate that MO-DCMAC outperforms\ntraditional rule-based policies across various environments and utility\nfunctions.\n","authors":["Jesse van Remmerden","Maurice Kenter","Diederik M. Roijers","Charalampos Andriotis","Yingqian Zhang","Zaharah Bukhsh"],"pdf_url":"https://arxiv.org/pdf/2406.06184v2.pdf","comment":"Accepted in the Neural Computing and Applications: Topical Collection\n on Multi-Objective Decision Making 2023 (MODeM 2023)"},{"id":"http://arxiv.org/abs/2402.18205v4","updated":"2025-01-08T14:21:46Z","published":"2024-02-28T09:51:55Z","title":"Lemur: Log Parsing with Entropy Sampling and Chain-of-Thought Merging","summary":" Logs produced by extensive software systems are integral to monitoring system\nbehaviors. Advanced log analysis facilitates the detection, alerting, and\ndiagnosis of system faults. Log parsing, which entails transforming raw log\nmessages into structured templates, constitutes a critical phase in the\nautomation of log analytics. Existing log parsers fail to identify the correct\ntemplates due to reliance on human-made rules. Besides, these methods focus on\nstatistical features while ignoring semantic information in log messages. To\naddress these challenges, we introduce a cutting-edge \\textbf{L}og parsing\nframework with \\textbf{E}ntropy sampling and Chain-of-Thought \\textbf{M}erging\n(Lemur). Specifically, to discard tedious manual rules, we propose a novel\nsampling method inspired by information entropy, which efficiently clusters\ntypical logs. Furthermore, to enhance the merging of log templates, we design a\nchain-of-thought method for large language models (LLMs). LLMs exhibit\nexceptional semantic comprehension, deftly distinguishing between parameters\nand invariant tokens. We have conducted experiments on large-scale public\ndatasets. Extensive evaluation demonstrates that Lemur achieves\nstate-of-the-art performance and impressive efficiency. 
The Code is available\nat https://github.com/zwpride/lemur.\n","authors":["Wei Zhang","Hongcheng Guo","Anjie Le","Jian Yang","Jiaheng Liu","Zhoujun Li"],"pdf_url":"https://arxiv.org/pdf/2402.18205v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04628v2","updated":"2025-01-08T15:00:39Z","published":"2024-12-05T21:50:22Z","title":"SWEPO: Simultaneous Weighted Preference Optimization for Group\n Contrastive Alignment","summary":" We introduce Simultaneous Weighted Preference Optimization (SWEPO), a novel\nextension of Direct Preference Optimization (DPO) designed to accommodate\nmultiple dynamically chosen positive and negative responses for each query.\nSWEPO employs a weighted group contrastive loss, assigning weights to responses\nbased on their deviation from the mean reward score. This approach effectively\nprioritizes responses that are significantly better or worse than the average,\nenhancing optimization. Our theoretical analysis demonstrates that\nsimultaneously considering multiple preferences reduces alignment bias,\nresulting in more robust alignment. Additionally, we provide insights into the\ntraining dynamics of our loss function and a related function, InfoNCA.\nEmpirical validation on the UltraFeedback dataset establishes SWEPO as\nstate-of-the-art, with superior performance in downstream evaluations using the\nAlpacaEval dataset.\n","authors":["Taneesh Gupta","Rahul Madhavan","Xuchao Zhang","Chetan Bansal","Saravan Rajmohan"],"pdf_url":"https://arxiv.org/pdf/2412.04628v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.02654v2","updated":"2025-01-08T14:53:41Z","published":"2025-01-05T20:39:52Z","title":"Tougher Text, Smarter Models: Raising the Bar for Adversarial Defence\n Benchmarks","summary":" Recent advancements in natural language processing have highlighted the\nvulnerability of deep learning models to adversarial attacks. 
While various\ndefence mechanisms have been proposed, there is a lack of comprehensive\nbenchmarks that evaluate these defences across diverse datasets, models, and\ntasks. In this work, we address this gap by presenting an extensive benchmark\nfor textual adversarial defence that significantly expands upon previous work.\nOur benchmark incorporates a wide range of datasets, evaluates state-of-the-art\ndefence mechanisms, and extends the assessment to include critical tasks such\nas single-sentence classification, similarity and paraphrase identification,\nnatural language inference, and commonsense reasoning. This work not only\nserves as a valuable resource for researchers and practitioners in the field of\nadversarial robustness but also identifies key areas for future research in\ntextual adversarial defence. By establishing a new standard for benchmarking in\nthis domain, we aim to accelerate progress towards more robust and reliable\nnatural language processing systems.\n","authors":["Yang Wang","Chenghua Lin"],"pdf_url":"https://arxiv.org/pdf/2501.02654v2.pdf","comment":"Will be presented as an oral in-person presentation at the conference\n of COLING 2025"},{"id":"http://arxiv.org/abs/2501.04541v1","updated":"2025-01-08T14:44:40Z","published":"2025-01-08T14:44:40Z","title":"Cyber-Physical Steganography in Robotic Motion Control","summary":" Steganography, the art of information hiding, has continually evolved across\nvisual, auditory and linguistic domains, adapting to the ceaseless interplay\nbetween steganographic concealment and steganalytic revelation. This study\nseeks to extend the horizons of what constitutes a viable steganographic medium\nby introducing a steganographic paradigm in robotic motion control. 
Based on\nthe observation of the robot's inherent sensitivity to changes in its\nenvironment, we propose a methodology to encode messages as environmental\nstimuli influencing the motions of the robotic agent and to decode messages\nfrom the resulting motion trajectory. The constraints of maximal robot\nintegrity and minimal motion deviation are established as fundamental\nprinciples underlying secrecy. As a proof of concept, we conduct experiments in\nsimulated environments across various manipulation tasks, incorporating robotic\nembodiments equipped with generalist multimodal policies.\n","authors":["Ching-Chun Chang","Yijie Lin","Isao Echizen"],"pdf_url":"https://arxiv.org/pdf/2501.04541v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02334v2","updated":"2025-01-08T14:42:05Z","published":"2024-04-26T15:02:39Z","title":"Rad4XCNN: a new agnostic method for post-hoc global explanation of\n CNN-derived features by means of radiomics","summary":" In recent years, machine learning-based clinical decision support systems\n(CDSS) have played a key role in the analysis of several medical conditions.\nDespite their promising capabilities, the lack of transparency in AI models\nposes significant challenges, particularly in medical contexts where\nreliability is a mandatory aspect. However, it appears that explainability is\ninversely proportional to accuracy. For this reason, achieving transparency\nwithout compromising predictive accuracy remains a key challenge. 
This paper\npresents a novel method, namely Rad4XCNN, to enhance the predictive power of\nCNN-derived features with the inherent interpretability of radiomic features.\nRad4XCNN diverges from conventional methods based on saliency maps, by\nassociating intelligible meaning to CNN-derived features by means of Radiomics,\noffering new perspectives on explanation methods beyond visualization maps.\nUsing a breast cancer classification task as a case study, we evaluated\nRad4XCNN on ultrasound imaging datasets, including an online dataset and two\nin-house datasets for internal and external validation. Some key results are:\ni) CNN-derived features guarantee more robust accuracy when compared against\nViT-derived and radiomic features; ii) conventional visualization map methods\nfor explanation present several pitfalls; iii) Rad4XCNN does not sacrifice\nmodel accuracy for their explainability; iv) Rad4XCNN provides a global\nexplanation enabling the physician to extract global insights and findings. Our\nmethod can mitigate some concerns related to the explainability-accuracy\ntrade-off. This study highlighted the importance of proposing new methods for\nmodel explanation without affecting their accuracy.\n","authors":["Francesco Prinzi","Carmelo Militello","Calogero Zarcaro","Tommaso Vincenzo Bartolotta","Salvatore Gaglio","Salvatore Vitabile"],"pdf_url":"https://arxiv.org/pdf/2405.02334v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.00599v2","updated":"2025-01-08T14:38:30Z","published":"2024-12-31T18:56:46Z","title":"VideoRefer Suite: Advancing Spatial-Temporal Object Understanding with\n Video LLM","summary":" Video Large Language Models (Video LLMs) have recently exhibited remarkable\ncapabilities in general video understanding. However, they mainly focus on\nholistic comprehension and struggle with capturing fine-grained spatial and\ntemporal details. 
Besides, the lack of high-quality object-level video\ninstruction data and a comprehensive benchmark further hinders their\nadvancements. To tackle these challenges, we introduce the VideoRefer Suite to\nempower Video LLM for finer-level spatial-temporal video understanding, i.e.,\nenabling perception and reasoning on any objects throughout the video.\nSpecifically, we thoroughly develop VideoRefer Suite across three essential\naspects: dataset, model, and benchmark. Firstly, we introduce a multi-agent\ndata engine to meticulously curate a large-scale, high-quality object-level\nvideo instruction dataset, termed VideoRefer-700K. Next, we present the\nVideoRefer model, which equips a versatile spatial-temporal object encoder to\ncapture precise regional and sequential representations. Finally, we\nmeticulously create a VideoRefer-Bench to comprehensively assess the\nspatial-temporal understanding capability of a Video LLM, evaluating it across\nvarious aspects. Extensive experiments and analyses demonstrate that our\nVideoRefer model not only achieves promising performance on video referring\nbenchmarks but also facilitates general video understanding capabilities.\n","authors":["Yuqian Yuan","Hang Zhang","Wentong Li","Zesen Cheng","Boqiang Zhang","Long Li","Xin Li","Deli Zhao","Wenqiao Zhang","Yueting Zhuang","Jianke Zhu","Lidong Bing"],"pdf_url":"https://arxiv.org/pdf/2501.00599v2.pdf","comment":"17 pages, 14 figures, technical report"},{"id":"http://arxiv.org/abs/2409.14457v2","updated":"2025-01-08T14:29:44Z","published":"2024-09-22T14:09:49Z","title":"Large Model Based Agents: State-of-the-Art, Cooperation Paradigms,\n Security and Privacy, and Future Trends","summary":" With the rapid advancement of large models (LMs), the development of\ngeneral-purpose intelligent agents powered by LMs has become a reality. 
It is\nforeseeable that in the near future, LM-driven general AI agents will serve as\nessential tools in production tasks, capable of autonomous communication and\ncollaboration without human intervention. This paper investigates scenarios\ninvolving the autonomous collaboration of future LM agents. We review the\ncurrent state of LM agents, the key technologies enabling LM agent\ncollaboration, and the security and privacy challenges they face during\ncooperative operations. To this end, we first explore the foundational\nprinciples of LM agents, including their general architecture, key components,\nenabling technologies, and modern applications. We then discuss practical\ncollaboration paradigms from data, computation, and knowledge perspectives to\nachieve connected intelligence among LM agents. After that, we analyze the\nsecurity vulnerabilities and privacy risks associated with LM agents,\nparticularly in multi-agent settings, examining underlying mechanisms and\nreviewing current and potential countermeasures. Lastly, we propose future\nresearch directions for building robust and secure LM agent ecosystems.\n","authors":["Yuntao Wang","Yanghe Pan","Zhou Su","Yi Deng","Quan Zhao","Linkang Du","Tom H. Luan","Jiawen Kang","Dusit Niyato"],"pdf_url":"https://arxiv.org/pdf/2409.14457v2.pdf","comment":"40 pages, 31 figures, 8 tables"},{"id":"http://arxiv.org/abs/2501.02156v3","updated":"2025-01-08T14:26:51Z","published":"2025-01-04T01:45:32Z","title":"The Race to Efficiency: A New Perspective on AI Scaling Laws","summary":" As large-scale AI models expand, training becomes costlier and sustaining\nprogress grows harder. Classical scaling laws (e.g., Kaplan et al. (2020),\nHoffmann et al. (2022)) predict training loss from a static compute budget yet\nneglect time and efficiency, prompting the question: how can we balance\nballooning GPU fleets with rapidly improving hardware and algorithms? 
We\nintroduce the relative-loss equation, a time- and efficiency-aware framework\nthat extends classical AI scaling laws. Our model shows that, without ongoing\nefficiency gains, advanced performance could demand millennia of training or\nunrealistically large GPU fleets. However, near-exponential progress remains\nachievable if the \"efficiency-doubling rate\" parallels Moore's Law. By\nformalizing this race to efficiency, we offer a quantitative roadmap for\nbalancing front-loaded GPU investments with incremental improvements across the\nAI stack. Empirical trends suggest that sustained efficiency gains can push AI\nscaling well into the coming decade, providing a new perspective on the\ndiminishing returns inherent in classical scaling.\n","authors":["Chien-Ping Lu"],"pdf_url":"https://arxiv.org/pdf/2501.02156v3.pdf","comment":"21 pages, 3 figures. 2 tables, second draft"},{"id":"http://arxiv.org/abs/2402.13809v3","updated":"2025-01-08T14:21:46Z","published":"2024-02-21T13:46:25Z","title":"NeuralDiffuser: Neuroscience-inspired Diffusion Guidance for fMRI Visual\n Reconstruction","summary":" Reconstructing visual stimuli from functional Magnetic Resonance Imaging fMRI\nenables fine-grained retrieval of brain activity. However, the accurate\nreconstruction of diverse details, including structure, background, texture,\ncolor, and more, remains challenging. The stable diffusion models inevitably\nresult in the variability of reconstructed images, even under identical\nconditions. To address this challenge, we first uncover the neuroscientific\nperspective of diffusion methods, which primarily involve top-down creation\nusing pre-trained knowledge from extensive image datasets, but tend to lack\ndetail-driven bottom-up perception, leading to a loss of faithful details. In\nthis paper, we propose NeuralDiffuser, which incorporates primary visual\nfeature guidance to provide detailed cues in the form of gradients. 
This\nextension of the bottom-up process for diffusion models achieves both semantic\ncoherence and detail fidelity when reconstructing visual stimuli. Furthermore,\nwe have developed a novel guidance strategy for reconstruction tasks that\nensures the consistency of repeated outputs with original images rather than\nwith various outputs. Extensive experimental results on the Natural Scenes\nDataset (NSD) qualitatively and quantitatively demonstrate the advancement of\nNeuralDiffuser by comparing it against baseline and state-of-the-art methods\nhorizontally, as well as conducting longitudinal ablation studies.\n","authors":["Haoyu Li","Hao Wu","Badong Chen"],"pdf_url":"https://arxiv.org/pdf/2402.13809v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04528v1","updated":"2025-01-08T14:19:54Z","published":"2025-01-08T14:19:54Z","title":"Towards a Problem-Oriented Domain Adaptation Framework for Machine\n Learning","summary":" Domain adaptation is a sub-field of machine learning that involves\ntransferring knowledge from a source domain to perform the same task in the\ntarget domain. It is a typical challenge in machine learning that arises, e.g.,\nwhen data is obtained from various sources or when using a data basis that\nchanges over time. Recent advances in the field offer promising methods, but it\nis still challenging for researchers and practitioners to determine if domain\nadaptation is suitable for a given problem -- and, subsequently, to select the\nappropriate approach. This article employs design science research to develop a\nproblem-oriented framework for domain adaptation, which is matured in three\nevaluation episodes. We describe a framework that distinguishes between five\ndomain adaptation scenarios, provides recommendations for addressing each\nscenario, and offers guidelines for determining if a problem falls into one of\nthese scenarios. 
During the multiple evaluation episodes, the framework is\ntested on artificial and real-world datasets and an experimental study\ninvolving 100 participants. The evaluation demonstrates that the framework has\nthe explanatory power to capture any domain adaptation problem effectively. In\nsummary, we provide clear guidance for researchers and practitioners who want\nto employ domain adaptation but lack in-depth knowledge of the possibilities.\n","authors":["Philipp Spitzer","Dominik Martin","Laurin Eichberger","Niklas Kühl"],"pdf_url":"https://arxiv.org/pdf/2501.04528v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.13867v2","updated":"2025-01-08T14:08:11Z","published":"2024-05-22T17:48:17Z","title":"Scaling-laws for Large Time-series Models","summary":" Scaling laws for large language models (LLMs) have provided useful guidance\nin training ever larger models for predictable performance gains. Time series\nforecasting shares a similar sequential structure to language, and is amenable\nto large-scale transformer architectures. Here we show that foundational\ndecoder-only time series transformer models exhibit analogous scaling-behavior\nto LLMs, with architectural details (aspect ratio and number of heads) having a\nminimal effect over broad ranges. We assemble a large corpus of heterogenous\ntime series data on which to train, and establish for the first time power-law\nscaling with parameter count, dataset size, and training compute, spanning five\norders of magnitude.\n","authors":["Thomas D. P. Edwards","James Alvey","Justin Alsing","Nam H. Nguyen","Benjamin D. 
Wandelt"],"pdf_url":"https://arxiv.org/pdf/2405.13867v2.pdf","comment":"4 main pages (16 total), 4 figures; Accepted for oral presentation in\n Time Series in the Age of Large Models (TSALM) Workshop at Neurips 2024"},{"id":"http://arxiv.org/abs/2409.12809v2","updated":"2025-01-08T13:59:28Z","published":"2024-09-19T14:34:20Z","title":"Don't be Fooled: The Misinformation Effect of Explanations in Human-AI\n Collaboration","summary":" Across various applications, humans increasingly use black-box artificial\nintelligence (AI) systems without insight into these systems' reasoning. To\ncounter this opacity, explainable AI (XAI) methods promise enhanced\ntransparency and interpretability. While recent studies have explored how XAI\naffects human-AI collaboration, few have examined the potential pitfalls caused\nby incorrect explanations. The implications for humans can be far-reaching but\nhave not been explored extensively. To investigate this, we ran a study (n=160)\non AI-assisted decision-making in which humans were supported by XAI. Our\nfindings reveal a misinformation effect when incorrect explanations accompany\ncorrect AI advice with implications post-collaboration. This effect causes\nhumans to infer flawed reasoning strategies, hindering task execution and\ndemonstrating impaired procedural knowledge. Additionally, incorrect\nexplanations compromise human-AI team-performance during collaboration. 
With\nour work, we contribute to HCI by providing empirical evidence for the negative\nconsequences of incorrect explanations on humans post-collaboration and\noutlining guidelines for designers of AI.\n","authors":["Philipp Spitzer","Joshua Holstein","Katelyn Morrison","Kenneth Holstein","Gerhard Satzger","Niklas Kühl"],"pdf_url":"https://arxiv.org/pdf/2409.12809v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04510v1","updated":"2025-01-08T13:56:17Z","published":"2025-01-08T13:56:17Z","title":"CGP-Tuning: Structure-Aware Soft Prompt Tuning for Code Vulnerability\n Detection","summary":" Large language models (LLMs) have been proposed as powerful tools for\ndetecting software vulnerabilities, where task-specific fine-tuning is\ntypically employed to provide vulnerability-specific knowledge to the LLMs for\nthis purpose. However, traditional full-parameter fine-tuning is inefficient\nfor modern, complex LLMs, which contain billions of parameters.\n Soft prompt tuning has been suggested as a more efficient alternative for\nfine-tuning LLMs in general cases. However, pure soft prompt tuning treats\nsource code as plain text, losing structural information inherent in source\ncode. Meanwhile, graph-enhanced soft prompt tuning methods, which aim to\naddress this issue, are unable to preserve the rich semantic information within\ncode graphs, as they are primarily designed for general graph-related tasks and\nfocus more on adjacency information. They also fail to ensure computational\nefficiency while accounting for graph-text interactions.\n This paper, therefore, introduces a new code graph-enhanced, structure-aware\nsoft prompt tuning method for vulnerability detection, referred to as\nCGP-Tuning. It employs innovative type-aware embeddings to capture the rich\nsemantic information within code graphs, along with a novel and efficient\ncross-modal alignment module that achieves linear computational cost while\nincorporating graph-text interactions. 
The proposed CGP-Tuning is evaluated on\nthe latest DiverseVul dataset and the most recent open-source code LLMs,\nCodeLlama and CodeGemma. Experimental results demonstrate that CGP-Tuning\noutperforms the best state-of-the-art method by an average of 3.5 percentage\npoints in accuracy, without compromising its vulnerability detection\ncapabilities for long source code.\n","authors":["Ruijun Feng","Hammond Pearce","Pietro Liguori","Yulei Sui"],"pdf_url":"https://arxiv.org/pdf/2501.04510v1.pdf","comment":"14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.02994v2","updated":"2025-01-08T13:35:45Z","published":"2024-07-03T10:49:21Z","title":"MedPix 2.0: A Comprehensive Multimodal Biomedical Data set for Advanced\n AI Applications with Retrieval Augmented Generation and Knowledge Graphs","summary":" The increasing interest in developing Artificial Intelligence applications in\nthe medical domain suffers from the lack of high-quality data sets, mainly due\nto privacy-related issues. In addition, the recent increase in large multimodal\nmodels (LMM) leads to the need for multimodal medical data sets, where clinical\nreports and findings are attached to the corresponding CT or MRI scans. This\npaper illustrates the entire workflow for building the MedPix 2.0 data set.\nStarting with the well-known multimodal data set\nMedPix\\textsuperscript{\\textregistered}, mainly used by physicians, nurses, and\nhealthcare students for Continuing Medical Education purposes, a semi-automatic\npipeline was developed to extract visual and textual data followed by a manual\ncuration procedure in which noisy samples were removed, thus creating a MongoDB\ndatabase. Along with the data set, we developed a GUI aimed at navigating\nefficiently the MongoDB instance and obtaining the raw data that can be easily\nused for training and/or fine-tuning LMMs. 
To enforce this point, in this work,\nwe first recall DR-Minerva, a RAG-based LMM trained using MedPix 2.0.\nDR-Minerva predicts the body part and the modality used to scan its input\nimage. We also propose the extension of DR-Minerva with a Knowledge Graph that\nuses Llama 3.1 Instruct 8B, and leverages MedPix 2.0. The resulting\narchitecture can be queried in an end-to-end manner, as a medical decision\nsupport system. MedPix 2.0 is available on GitHub.\n\\url{https://github.com/CHILab1/MedPix-2.0}\n","authors":["Irene Siragusa","Salvatore Contino","Massimo La Ciura","Rosario Alicata","Roberto Pirrone"],"pdf_url":"https://arxiv.org/pdf/2407.02994v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04493v1","updated":"2025-01-08T13:26:24Z","published":"2025-01-08T13:26:24Z","title":"The Role of Machine Learning in Congenital Heart Disease Diagnosis:\n Datasets, Algorithms, and Insights","summary":" Congenital heart disease is among the most common fetal abnormalities and\nbirth defects. Despite identifying numerous risk factors influencing its onset,\na comprehensive understanding of its genesis and management across diverse\npopulations remains limited. Recent advancements in machine learning have\ndemonstrated the potential for leveraging patient data to enable early\ncongenital heart disease detection. Over the past seven years, researchers have\nproposed various data-driven and algorithmic solutions to address this\nchallenge. This paper presents a systematic review of congenital heart disease\nrecognition using machine learning, conducting a meta-analysis of 432\nreferences from leading journals published between 2018 and 2024. A detailed\ninvestigation of 74 scholarly works highlights key factors, including\ndatabases, algorithms, applications, and solutions. Additionally, the survey\noutlines reported datasets used by machine learning experts for congenital\nheart disease recognition. 
Using a systematic literature review methodology,\nthis study identifies critical challenges and opportunities in applying machine\nlearning to congenital heart disease.\n","authors":["Khalil Khan","Farhan Ullah","Ikram Syed","Irfan Ullah"],"pdf_url":"https://arxiv.org/pdf/2501.04493v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16586v2","updated":"2025-01-08T13:16:26Z","published":"2024-09-25T03:25:34Z","title":"AutoSTF: Decoupled Neural Architecture Search for Cost-Effective\n Automated Spatio-Temporal Forecasting","summary":" Spatio-temporal forecasting is a critical component of various smart city\napplications, such as transportation optimization, energy management, and\nsocio-economic analysis. Recently, several automated spatio-temporal\nforecasting methods have been proposed to automatically search the optimal\nneural network architecture for capturing complex spatio-temporal dependencies.\nHowever, the existing automated approaches suffer from expensive neural\narchitecture search overhead, which hinders their practical use and the further\nexploration of diverse spatio-temporal operators in a finer granularity. In\nthis paper, we propose AutoSTF, a decoupled automatic neural architecture\nsearch framework for cost-effective automated spatio-temporal forecasting. From\nthe efficiency perspective, we first decouple the mixed search space into\ntemporal space and spatial space and respectively devise representation\ncompression and parameter-sharing schemes to mitigate the parameter explosion.\nThe decoupled spatio-temporal search not only expedites the model optimization\nprocess but also leaves new room for more effective spatio-temporal dependency\nmodeling. From the effectiveness perspective, we propose a multi-patch transfer\nmodule to jointly capture multi-granularity temporal dependencies and extend\nthe spatial search space to enable finer-grained layer-wise spatial dependency\nsearch. 
Extensive experiments on eight datasets demonstrate the superiority of\nAutoSTF in terms of both accuracy and efficiency. Specifically, our proposed\nmethod achieves up to 13.48x speed-up compared to state-of-the-art automatic\nspatio-temporal forecasting methods while maintaining the best forecasting\naccuracy.\n","authors":["Tengfei Lyu","Weijia Zhang","Jinliang Deng","Hao Liu"],"pdf_url":"https://arxiv.org/pdf/2409.16586v2.pdf","comment":"Accepted by KDD 2025 Research Track"},{"id":"http://arxiv.org/abs/2501.04487v1","updated":"2025-01-08T13:14:05Z","published":"2025-01-08T13:14:05Z","title":"Integrating remote sensing data assimilation, deep learning and large\n language model for interactive wheat breeding yield prediction","summary":" Yield is one of the core goals of crop breeding. By predicting the potential\nyield of different breeding materials, breeders can screen these materials at\nvarious growth stages to select the best performing. Based on unmanned aerial\nvehicle remote sensing technology, high-throughput crop phenotyping data in\nbreeding areas is collected to provide data support for the breeding decisions\nof breeders. However, the accuracy of current yield predictions still requires\nimprovement, and the usability and user-friendliness of yield forecasting tools\nremain suboptimal. To address these challenges, this study introduces a hybrid\nmethod and tool for crop yield prediction, designed to allow breeders to\ninteractively and accurately predict wheat yield by chatting with a large\nlanguage model (LLM). First, the newly designed data assimilation algorithm is\nused to assimilate the leaf area index into the WOFOST model. Then, selected\noutputs from the assimilation process, along with remote sensing inversion\nresults, are used to drive the time-series temporal fusion transformer model\nfor wheat yield prediction. 
Finally, based on this hybrid method and leveraging\nan LLM with retrieval augmented generation technology, we developed an\ninteractive yield prediction Web tool that is user-friendly and supports\nsustainable data updates. This tool integrates multi-source data to assist\nbreeding decision-making. This study aims to accelerate the identification of\nhigh-yield materials in the breeding process, enhance breeding efficiency, and\nenable more scientific and smart breeding decisions.\n","authors":["Guofeng Yang","Nanfei Jin","Wenjie Ai","Zhonghua Zheng","Yuhong He","Yong He"],"pdf_url":"https://arxiv.org/pdf/2501.04487v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.01433v2","updated":"2025-01-08T13:08:32Z","published":"2024-12-18T02:00:53Z","title":"Mathematical Definition and Systematization of Puzzle Rules","summary":" While logic puzzles have engaged individuals through problem-solving and\ncritical thinking, the creation of new puzzle rules has largely relied on\nad-hoc processes. Pencil puzzles, such as Slitherlink and Sudoku, represent a\nprominent subset of these games, celebrated for their intellectual challenges\nrooted in combinatorial logic and spatial reasoning. Despite extensive research\ninto solving techniques and automated problem generation, a unified framework\nfor systematic and scalable rule design has been lacking. Here, we introduce a\nmathematical framework for defining and systematizing pencil puzzle rules. This\nframework formalizes grid elements, their positional relationships, and\niterative composition operations, allowing for the incremental construction of\nstructures that form the basis of puzzle rules. Furthermore, we establish a\nformal method to describe constraints and domains for each structure, ensuring\nsolvability and coherence. 
Applying this framework, we successfully formalized\nthe rules of well-known Nikoli puzzles, including Slitherlink and Sudoku,\ndemonstrating the formal representation of a significant portion (approximately\none-fourth) of existing puzzles. These results validate the potential of the\nframework to systematize and innovate puzzle rule design, establishing a\npathway to automated rule generation. By providing a mathematical foundation\nfor puzzle rule creation, this framework opens avenues for computers,\npotentially enhanced by AI, to design novel puzzle rules tailored to player\npreferences, expanding the scope of puzzle diversity. Beyond its direct\napplication to pencil puzzles, this work illustrates how mathematical\nframeworks can bridge recreational mathematics and algorithmic design, offering\ntools for broader exploration in logic-based systems, with potential\napplications in educational game design, personalized learning, and\ncomputational creativity.\n","authors":["Itsuki Maeda","Yasuhiro Inoue"],"pdf_url":"https://arxiv.org/pdf/2501.01433v2.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2501.04480v1","updated":"2025-01-08T13:03:34Z","published":"2025-01-08T13:03:34Z","title":"Research on environment perception and behavior prediction of\n intelligent UAV based on semantic communication","summary":" The convergence of drone delivery systems, virtual worlds, and blockchain has\ntransformed logistics and supply chain management, providing a fast and\nenvironmentally friendly alternative to traditional ground transportation\nmethods. To provide users with a real-world experience, virtual service providers\nneed to collect up-to-the-minute delivery information from edge devices. 
To\naddress this challenge, 1) a reinforcement learning approach is introduced to\nenable drones with fast training capabilities and the ability to autonomously\nadapt to new virtual scenarios for effective resource allocation. 2) A semantic\ncommunication framework for meta-universes is proposed, which utilizes the\nextraction of semantic information to reduce the communication cost and\nincentivize the transmission of information for meta-universe services. 3) In\norder to ensure user information security, a lightweight authentication\nand key agreement scheme is designed between the drone and the user by\nintroducing blockchain technology. In our experiments, the drone adaptation\nperformance is improved by about 35\\%, and the local offloading rate can reach\n90\\% as the number of base stations increases. The semantic\ncommunication system proposed in this paper is compared with the Cross Entropy\nbaseline model. With the introduction of blockchain technology, the transaction\nthroughput is maintained at a stable value for different numbers of drones.\n","authors":["Kechong Ren","Li Gao","Qi Guan"],"pdf_url":"https://arxiv.org/pdf/2501.04480v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04472v1","updated":"2025-01-08T12:51:34Z","published":"2025-01-08T12:51:34Z","title":"Hybrid Artificial Intelligence Strategies for Drone Navigation","summary":" Objective: This paper describes the development of hybrid artificial\nintelligence strategies for drone navigation. Methods: The navigation module\ncombines a deep learning model with a rule-based engine depending on the agent\nstate. The deep learning model has been trained using reinforcement learning.\nThe rule-based engine uses expert knowledge to deal with specific situations.\nThe navigation module incorporates several strategies to explain the drone\ndecision based on its observation space, and different mechanisms for including\nhuman decisions in the navigation process. 
Finally, this paper proposes an\nevaluation methodology based on defining several scenarios and analyzing the\nperformance of the different strategies according to metrics adapted to each\nscenario. Results: Two main navigation problems have been studied. For the\nfirst scenario (reaching known targets), it has been possible to obtain a 90%\ntask completion rate, significantly reducing the number of collisions thanks to\nthe rule-based engine. For the second scenario, it has been possible to reduce\nthe time required to locate all the targets by 20% using the reinforcement\nlearning model. Conclusions: Reinforcement learning is a very good strategy to\nlearn policies for drone navigation, but in critical situations, it is\nnecessary to complement it with a rule-based module to increase the task success\nrate.\n","authors":["Rubén San-Segundo","Lucía Angulo","Manuel Gil-Martín","David Carramiñana","Ana M. Bernardos"],"pdf_url":"https://arxiv.org/pdf/2501.04472v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08023v2","updated":"2025-01-08T12:40:56Z","published":"2024-09-12T13:05:28Z","title":"Edge-Wise Graph-Instructed Neural Networks","summary":" The problem of multi-task regression over graph nodes has been recently\napproached through Graph-Instructed Neural Network (GINN), which is a promising\narchitecture belonging to the subset of message-passing graph neural networks.\nIn this work, we discuss the limitations of the Graph-Instructed (GI) layer,\nand we formalize a novel edge-wise GI (EWGI) layer. 
We discuss the advantages\nof the EWGI layer and we provide numerical evidence that EWGINNs perform better\nthan GINNs over some graph-structured input data, like the ones inferred from\nthe Barabasi-Albert graph, and improve the training regularization on graphs\nwith chaotic connectivity, like the ones inferred from the Erdos-Renyi graph.\n","authors":["Francesco Della Santa","Antonio Mastropietro","Sandra Pieraccini","Francesco Vaccarino"],"pdf_url":"https://arxiv.org/pdf/2409.08023v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16149v4","updated":"2025-01-08T12:40:27Z","published":"2024-03-24T13:43:43Z","title":"Analyzing Consumer IoT Traffic from Security and Privacy Perspectives: a\n Comprehensive Survey","summary":" The Consumer Internet of Things (CIoT), a notable segment within the IoT\ndomain, involves the integration of IoT technology into consumer electronics\nand devices, such as smart homes and smart wearables. Compared to traditional\nIoT fields, CIoT differs notably in target users, product types, and design\napproaches. While offering convenience to users, it also raises new security\nand privacy concerns. Network traffic analysis, a widely used technique in the\nsecurity community, has been extensively applied to investigate these concerns\nabout CIoT. Compared to network traffic analysis in other fields such as mobile\napps and websites, CIoT presents unique characteristics, introducing new\nchallenges and research opportunities. Researchers have made significant\ncontributions in this area. To aid researchers in understanding the application\nof traffic analysis tools for studying CIoT security and privacy risks, this\nsurvey reviews 303 publications on traffic analysis within the CIoT security\nand privacy domain from January 2018 to June 2024, focusing on three research\nquestions. Our work: 1) outlines the CIoT traffic analysis process and\nhighlights its differences from general network traffic analysis. 
2) summarizes\nand classifies existing research into four categories according to its\napplication objectives: device fingerprinting, user activity inference,\nmalicious traffic detection, and measurement. 3) explores emerging challenges\nand potential future research directions based on each step of the CIoT traffic\nanalysis process. This will provide new insights to the community and guide the\nindustry towards safer product designs.\n","authors":["Yan Jia","Yuxin Song","Zihou Liu","Qingyin Tan","Yang Song","Yu Zhang","Zheli Liu"],"pdf_url":"https://arxiv.org/pdf/2403.16149v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04444v1","updated":"2025-01-08T11:53:30Z","published":"2025-01-08T11:53:30Z","title":"A novel Facial Recognition technique with Focusing on Masked Faces","summary":" Recognizing the same faces with and without masks is important for ensuring\nconsistent identification in security, access control, and public safety. This\ncapability is crucial in scenarios like law enforcement, healthcare, and\nsurveillance, where accurate recognition must be maintained despite facial\nocclusion. This research focuses on the challenge of recognizing the same faces\nwith and without masks by employing cosine similarity as the primary technique.\nWith the increased use of masks, traditional facial recognition systems face\nsignificant accuracy issues, making it crucial to develop methods that can\nreliably identify individuals in masked conditions. For that reason, this study\nproposed Masked-Unmasked Face Matching Model (MUFM). This model employs\ntransfer learning using the Visual Geometry Group (VGG16) model to extract\nsignificant facial features, which are subsequently classified utilizing the\nK-Nearest Neighbors (K-NN) algorithm. The cosine similarity metric is employed\nto compare masked and unmasked faces of the same individuals. 
This approach\nrepresents a novel contribution, as the task of recognizing the same individual\nwith and without a mask using cosine similarity has not been previously\naddressed. By integrating these advanced methodologies, the research\ndemonstrates effective identification of individuals despite the presence of\nmasks, addressing a significant limitation in traditional systems. Data\ncollection is another essential part of this work: an image dataset was\nassembled and prepared from three different sources, some of which contain\nreal-world data, which strengthens the comprehensiveness of this research. The\nimages used were drawn from three existing datasets of masked and unmasked\nversions of the same faces.\n","authors":["Dana A Abdullah","Dana Rasul Hamad","Hakem Beitollahi","Ismail Y Maolood","Abdulhady Abas Abdullah","Aso Khaleel Ameen"],"pdf_url":"https://arxiv.org/pdf/2501.04444v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03301v2","updated":"2025-01-08T11:47:25Z","published":"2025-01-06T15:19:26Z","title":"Rethinking Byzantine Robustness in Federated Recommendation from Sparse\n Aggregation Perspective","summary":" To preserve user privacy in recommender systems, federated recommendation\n(FR) based on federated learning (FL) emerges, keeping the personal data on the\nlocal client and updating a model collaboratively. Unlike FL, FR has a unique\nsparse aggregation mechanism, where the embedding of each item is updated by\nonly partial clients, instead of full clients in a dense aggregation of general\nFL. Recently, as an essential principle of FL, model security has received\nincreasing attention, especially for Byzantine attacks, where malicious clients\ncan send arbitrary updates. The problem of exploring the Byzantine robustness\nof FR is particularly critical since in the domains applying FR, e.g.,\ne-commerce, malicious clients can be injected easily by registering new\naccounts. 
However, existing Byzantine works neglect the unique sparse\naggregation of FR, making them unsuitable for our problem. Thus, we make the\nfirst effort to investigate Byzantine attacks on FR from the perspective of\nsparse aggregation, which is non-trivial: it is not clear how to define\nByzantine robustness under sparse aggregations and design Byzantine attacks\nunder limited knowledge/capability. In this paper, we reformulate the Byzantine\nrobustness under sparse aggregation by defining the aggregation for a single\nitem as the smallest execution unit. Then we propose a family of effective\nattack strategies, named Spattack, which exploit the vulnerability in sparse\naggregation and are categorized along the adversary's knowledge and capability.\nExtensive experimental results demonstrate that Spattack can effectively\nprevent convergence and even break down defenses under a few malicious clients,\nraising alarms for securing FR systems.\n","authors":["Zhongjian Zhang","Mengmei Zhang","Xiao Wang","Lingjuan Lyu","Bo Yan","Junping Du","Chuan Shi"],"pdf_url":"https://arxiv.org/pdf/2501.03301v2.pdf","comment":"accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2501.04438v1","updated":"2025-01-08T11:39:28Z","published":"2025-01-08T11:39:28Z","title":"Effect of Information Technology on Job Creation to Support Economic:\n Case Studies of Graduates in Universities (2023-2024) of the KRG of Iraq","summary":" The aim of this study is to assess the impact of information technology (IT)\non university graduates in terms of employment development, which will aid in\neconomic issues. This study uses a descriptive research methodology and a\nquantitative approach to understand variables. The focus of this study is to\nascertain how graduates of Kurdistan regional universities might use IT to\nsecure employment and significantly contribute to the nation's economic\nrevival. 
The sample size was established by the use of a judgmental sampling\nprocedure and consisted of 314 people. The researcher prepared the\nquestionnaire to collect data, and then SPSS statistical software, version 22,\nand Excel 2010 were used to modify, compile, and tabulate the results. The\nstudy's outcome showed that information technology is incredibly inventive, has\na promising future, and makes life much easier for everyone. It also proved\nthat a deep academic understanding of information technology and its\nconstituent parts helps graduates of Kurdistan Regional University find\nsuitable careers. More importantly, though, anyone looking for work or a means\nof support will find great benefit from possessing credentials and an\nunderstanding of IT. The study's final finding was that information technology\nhas actively advanced the country's economy. Not only is IT helping to boost\nyouth employment, but it is also turning into a worthwhile investment for\neconomic growth.\n","authors":["Azhi Kh. Bapir","Ismail Y. Maolood","Dana A Abdullah","Aso K. Ameen","Abdulhady Abas Abdullah"],"pdf_url":"https://arxiv.org/pdf/2501.04438v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04437v1","updated":"2025-01-08T11:37:35Z","published":"2025-01-08T11:37:35Z","title":"Integrating LLMs with ITS: Recent Advances, Potentials, Challenges, and\n Future Directions","summary":" Intelligent Transportation Systems (ITS) are crucial for the development and\noperation of smart cities, addressing key challenges in efficiency,\nproductivity, and environmental sustainability. This paper comprehensively\nreviews the transformative potential of Large Language Models (LLMs) in\noptimizing ITS. 
Initially, we provide an extensive overview of ITS,\nhighlighting its components, operational principles, and overall effectiveness.\nWe then delve into the theoretical background of various LLM techniques, such\nas GPT, T5, CTRL, and BERT, elucidating their relevance to ITS applications.\nFollowing this, we examine the wide-ranging applications of LLMs within ITS,\nincluding traffic flow prediction, vehicle detection and classification,\nautonomous driving, traffic sign recognition, and pedestrian detection. Our\nanalysis reveals how these advanced models can significantly enhance traffic\nmanagement and safety. Finally, we explore the challenges and limitations LLMs\nface in ITS, such as data availability, computational constraints, and ethical\nconsiderations. We also present several future research directions and\npotential innovations to address these challenges. This paper aims to guide\nresearchers and practitioners through the complexities and opportunities of\nintegrating LLMs in ITS, offering a roadmap to create more efficient,\nsustainable, and responsive next-generation transportation systems.\n","authors":["Doaa Mahmud","Hadeel Hajmohamed","Shamma Almentheri","Shamma Alqaydi","Lameya Aldhaheri","Ruhul Amin Khalil","Nasir Saeed"],"pdf_url":"https://arxiv.org/pdf/2501.04437v1.pdf","comment":"Accepted for publication in IEEE Transactions on Intelligent\n Transportation Systems"},{"id":"http://arxiv.org/abs/2501.04436v1","updated":"2025-01-08T11:37:06Z","published":"2025-01-08T11:37:06Z","title":"Federated Fine-Tuning of LLMs: Framework Comparison and Research\n Directions","summary":" Federated learning (FL) provides a privacy-preserving solution for\nfine-tuning pre-trained large language models (LLMs) using distributed private\ndatasets, enabling task-specific adaptation while preserving data privacy.\nHowever, fine-tuning the extensive parameters in LLMs is particularly\nchallenging in resource-constrained federated scenarios due to the 
significant\ncommunication and computational costs. To gain a deeper understanding of how\nthese challenges can be addressed, this article conducts a comparative analysis\nof three advanced federated LLM (FedLLM) frameworks that integrate knowledge\ndistillation (KD) and split learning (SL) to mitigate these issues: 1) FedLLMs,\nwhere clients upload model parameters or gradients to enable straightforward\nand effective fine-tuning; 2) KD-FedLLMs, which leverage KD for efficient\nknowledge sharing via logits; and 3) Split-FedLLMs, which split the LLMs into\ntwo parts, with one part executed on the client and the other on the\nserver, to balance the computational load. Each framework is evaluated based on\nkey performance metrics, including model accuracy, communication overhead, and\nclient-side computational load, offering insights into their effectiveness for\nvarious federated fine-tuning scenarios. Through this analysis, we identify\nframework-specific optimization opportunities to enhance the efficiency of\nFedLLMs and discuss broader research directions, highlighting open\nopportunities to better adapt FedLLMs for real-world applications. A use case\nis presented to demonstrate the performance comparison of these three\nframeworks under varying configurations and settings.\n","authors":["Na Yan","Yang Su","Yansha Deng","Robert Schober"],"pdf_url":"https://arxiv.org/pdf/2501.04436v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04435v1","updated":"2025-01-08T11:31:39Z","published":"2025-01-08T11:31:39Z","title":"A Digital Shadow for Modeling, Studying and Preventing Urban Crime","summary":" Crime is one of the greatest threats to urban security. Around 80 percent of\nthe world's population lives in countries with high levels of criminality. Most\nof the crimes committed in the cities take place in their urban environments.\nThis paper presents the development and validation of a digital shadow platform\nfor modeling and simulating urban crime. 
This digital shadow has been\nconstructed using data-driven agent-based modeling and simulation techniques,\nwhich are suitable for capturing dynamic interactions among individuals and\nwith their environment. Our approach transforms and integrates well-known\ncriminological theories and the expert knowledge of law enforcement agencies\n(LEA), policy makers, and other stakeholders under a theoretical model, which\nis in turn combined with real crime, spatial (cartographic) and socio-economic\ndata into an urban model characterizing the daily behavior of citizens. The\ndigital shadow has also been instantiated for the city of Malaga, for which we\nhad over 300,000 complaints available. This instance has been calibrated with\nthose complaints and other geographic and socio-economic information of the\ncity. To the best of our knowledge, our digital shadow is the first for large\nurban areas that has been calibrated with a large dataset of real crime reports\nand with an accurate representation of the urban environment. 
The performance\nindicators of the model after being calibrated, in terms of the metrics widely\nused in predictive policing, suggest that our simulated crime generation\nmatches the general pattern of crime in the city according to historical data.\nOur digital shadow platform could be an interesting tool for modeling and\npredicting criminal behavior in an urban environment on a daily basis and,\nthus, a useful tool for policy makers, criminologists, sociologists, LEAs, etc.\nto study and prevent urban crime.\n","authors":["Juan Palma-Borda","Eduardo Guzmán","María-Victoria Belmonte"],"pdf_url":"https://arxiv.org/pdf/2501.04435v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.11189v2","updated":"2025-01-08T11:24:17Z","published":"2024-12-15T13:48:39Z","title":"Leveraging Large Language Models for Active Merchant Non-player\n Characters","summary":" We highlight two significant issues leading to the passivity of current\nmerchant non-player characters (NPCs): pricing and communication. While\nimmersive interactions have been a focus, negotiations between merchant NPCs\nand players on item prices have not received sufficient attention. First, we\ndefine passive pricing as the limited ability of merchants to modify predefined\nitem prices. Second, passive communication means that merchants can only\ninteract with players in a scripted manner. To tackle these issues and create\nan active merchant NPC, we propose a merchant framework based on large language\nmodels (LLMs), called MART, which consists of an appraiser module and a\nnegotiator module. We conducted two experiments to guide game developers in\nselecting appropriate implementations by comparing different training methods\nand LLM sizes. Our findings indicate that finetuning methods, such as\nsupervised finetuning (SFT) and knowledge distillation (KD), are effective in\nusing smaller LLMs to implement active merchant NPCs. 
Additionally, we found\nthree irregular cases arising from the responses of LLMs. We expect our\nfindings to guide developers in using LLMs for developing active merchant NPCs.\n","authors":["Byungjun Kim","Minju Kim","Dayeon Seo","Bugeun Kim"],"pdf_url":"https://arxiv.org/pdf/2412.11189v2.pdf","comment":"Under review / Modified the links to code and dataset"},{"id":"http://arxiv.org/abs/2501.04426v1","updated":"2025-01-08T11:20:48Z","published":"2025-01-08T11:20:48Z","title":"Dual-Force: Enhanced Offline Diversity Maximization under Imitation\n Constraints","summary":" While many algorithms for diversity maximization under imitation constraints\nare online in nature, many applications require offline algorithms without\nenvironment interactions. Tackling this problem in the offline setting,\nhowever, presents significant challenges that require non-trivial, multi-stage\noptimization processes with non-stationary rewards. In this work, we present a\nnovel offline algorithm that enhances diversity using an objective based on Van\nder Waals (VdW) force and successor features, and eliminates the need to learn\na previously used skill discriminator. Moreover, by conditioning the value\nfunction and policy on a pre-trained Functional Reward Encoding (FRE), our\nmethod allows for better handling of non-stationary rewards and provides\nzero-shot recall of all skills encountered during training, significantly\nexpanding the set of skills learned in prior work. Consequently, our algorithm\nbenefits from receiving a consistently strong diversity signal (VdW), and\nenjoys more stable and efficient training. 
We demonstrate the effectiveness of\nour method in generating diverse skills for two robotic tasks in simulation:\nlocomotion of a quadruped and local navigation with obstacle traversal.\n","authors":["Pavel Kolev","Marin Vlastelica","Georg Martius"],"pdf_url":"https://arxiv.org/pdf/2501.04426v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04424v1","updated":"2025-01-08T11:17:40Z","published":"2025-01-08T11:17:40Z","title":"NSA: Neuro-symbolic ARC Challenge","summary":" The Abstraction and Reasoning Corpus (ARC) evaluates general reasoning\ncapabilities that are difficult for both machine learning models and\ncombinatorial search methods. We propose a neuro-symbolic approach that\ncombines a transformer for proposal generation with combinatorial search using\na domain-specific language. The transformer narrows the search space by\nproposing promising search directions, which allows the combinatorial search to\nfind the actual solution in a short time. We pre-train the transformer with\nsynthetically generated data. During test-time we generate additional\ntask-specific training tasks and fine-tune our model. Our results surpass\ncomparable state of the art on the ARC evaluation set by 27% and compare\nfavourably on the ARC train set. We make our code and dataset publicly\navailable at https://github.com/Batorskq/NSA.\n","authors":["Paweł Batorski","Jannik Brinkmann","Paul Swoboda"],"pdf_url":"https://arxiv.org/pdf/2501.04424v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.06652v4","updated":"2025-01-08T11:10:16Z","published":"2022-08-13T13:46:13Z","title":"Differentiable Inductive Logic Programming in High-Dimensional Space","summary":" Synthesizing large logic programs through symbolic Inductive Logic\nProgramming (ILP) typically requires intermediate definitions. However,\ncluttering the hypothesis space with intensional predicates typically degrades\nperformance. 
In contrast, gradient descent provides an efficient way to find\nsolutions within such high-dimensional spaces. Neuro-symbolic ILP approaches\nhave not fully exploited this so far. We propose extending the {\\delta}ILP\napproach to inductive synthesis with large-scale predicate invention, thus\nallowing us to exploit the efficacy of high-dimensional gradient descent. We\nshow that large-scale predicate invention benefits differentiable inductive\nsynthesis through gradient descent and allows one to learn solutions for tasks\nbeyond the capabilities of existing neuro-symbolic ILP systems. Furthermore, we\nachieve these results without specifying the precise structure of the solution\nwithin the language bias.\n","authors":["Stanisław J. Purgał","David M. Cerna","Cezary Kaliszyk"],"pdf_url":"https://arxiv.org/pdf/2208.06652v4.pdf","comment":"8 pages, To appear, published at IJCLR 2024"},{"id":"http://arxiv.org/abs/2501.04410v1","updated":"2025-01-08T10:49:13Z","published":"2025-01-08T10:49:13Z","title":"User Simulation in the Era of Generative AI: User Modeling, Synthetic\n Data Generation, and System Evaluation","summary":" User simulation is an emerging interdisciplinary topic with multiple critical\napplications in the era of Generative AI. It involves creating an intelligent\nagent that mimics the actions of a human user interacting with an AI system,\nenabling researchers to model and analyze user behaviour, generate synthetic\ndata for training, and evaluate interactive AI systems in a controlled and\nreproducible manner. User simulation has profound implications for diverse\nfields and plays a vital role in the pursuit of Artificial General\nIntelligence. 
This paper provides an overview of user simulation, highlighting\nits key applications, connections to various disciplines, and outlining future\nresearch directions to advance this increasingly important technology.\n","authors":["Krisztian Balog","ChengXiang Zhai"],"pdf_url":"https://arxiv.org/pdf/2501.04410v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03535v2","updated":"2025-01-08T10:34:54Z","published":"2025-01-07T05:15:46Z","title":"SenseRAG: Constructing Environmental Knowledge Bases with Proactive\n Querying for LLM-Based Autonomous Driving","summary":" This study addresses the critical need for enhanced situational awareness in\nautonomous driving (AD) by leveraging the contextual reasoning capabilities of\nlarge language models (LLMs). Unlike traditional perception systems that rely\non rigid, label-based annotations, it integrates real-time, multimodal sensor\ndata into a unified, LLMs-readable knowledge base, enabling LLMs to dynamically\nunderstand and respond to complex driving environments. To overcome the\ninherent latency and modality limitations of LLMs, a proactive\nRetrieval-Augmented Generation (RAG) is designed for AD, combined with a\nchain-of-thought prompting mechanism, ensuring rapid and context-rich\nunderstanding. 
Experimental results using real-world Vehicle-to-everything\n(V2X) datasets demonstrate significant improvements in perception and\nprediction performance, highlighting the potential of this framework to enhance\nsafety, adaptability, and decision-making in next-generation AD systems.\n","authors":["Xuewen Luo","Fan Ding","Fengze Yang","Yang Zhou","Junnyong Loo","Hwa Hui Tew","Chenxi Liu"],"pdf_url":"https://arxiv.org/pdf/2501.03535v2.pdf","comment":"This paper has been accepted for presentation at WACV Workshop LLMAD\n 2025"},{"id":"http://arxiv.org/abs/2309.06941v3","updated":"2025-01-08T09:35:58Z","published":"2023-09-13T13:24:27Z","title":"DEFormer: DCT-driven Enhancement Transformer for Low-light Image and\n Dark Vision","summary":" Low-light image enhancement restores the colors and details of a single image\nand improves high-level visual tasks. However, restoring the lost details in\nthe dark area is still a challenge relying only on the RGB domain. In this\npaper, we delve into frequency as a new clue for the model and propose a\nDCT-driven enhancement transformer (DEFormer) framework. First, we propose a\nlearnable frequency branch (LFB) for frequency enhancement that contains DCT\nprocessing and curvature-based frequency enhancement (CFE) to represent\nfrequency features. Additionally, we propose a cross-domain fusion (CDF) to\nreduce the differences between the RGB domain and the frequency domain. 
Our\nDEFormer has achieved superior results on the LOL and MIT-Adobe FiveK datasets,\nimproving the dark detection performance.\n","authors":["Xiangchen Yin","Zhenda Yu","Xin Gao","Xiao Sun"],"pdf_url":"https://arxiv.org/pdf/2309.06941v3.pdf","comment":"Accepted by ICASSP"},{"id":"http://arxiv.org/abs/2501.04377v1","updated":"2025-01-08T09:34:15Z","published":"2025-01-08T09:34:15Z","title":"On Computational Limits and Provably Efficient Criteria of Visual\n Autoregressive Models: A Fine-Grained Complexity Analysis","summary":" Recently, Visual Autoregressive ($\\mathsf{VAR}$) Models introduced a\ngroundbreaking advancement in the field of image generation, offering a\nscalable approach through a coarse-to-fine \"next-scale prediction\" paradigm.\nHowever, the state-of-the-art algorithm of $\\mathsf{VAR}$ models in [Tian,\nJiang, Yuan, Peng and Wang, NeurIPS 2024] takes $O(n^4)$ time, which is\ncomputationally inefficient. In this work, we analyze the computational limits\nand efficiency criteria of $\\mathsf{VAR}$ Models through a fine-grained\ncomplexity lens. Our key contribution is identifying the conditions under which\n$\\mathsf{VAR}$ computations can achieve sub-quadratic time complexity.\nSpecifically, we establish a critical threshold for the norm of input matrices\nused in $\\mathsf{VAR}$ attention mechanisms. Above this threshold, assuming the\nStrong Exponential Time Hypothesis ($\\mathsf{SETH}$) from fine-grained\ncomplexity theory, a sub-quartic time algorithm for $\\mathsf{VAR}$ models is\nimpossible. To substantiate our theoretical findings, we present efficient\nconstructions leveraging low-rank approximations that align with the derived\ncriteria. This work initiates the study of the computational efficiency of the\n$\\mathsf{VAR}$ model from a theoretical perspective. 
Our technique will shed\nlight on advancing scalable and efficient image generation in $\\mathsf{VAR}$\nframeworks.\n","authors":["Yekun Ke","Xiaoyu Li","Yingyu Liang","Zhizhou Sha","Zhenmei Shi","Zhao Song"],"pdf_url":"https://arxiv.org/pdf/2501.04377v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18601v2","updated":"2025-01-08T09:30:47Z","published":"2024-07-26T08:41:58Z","title":"Reorganizing attention-space geometry with expressive attention","summary":" Attention regulates information transfer between tokens. For this, query and\nkey vectors are compared, typically in terms of a scalar product,\n$\\mathbf{Q}^T\\mathbf{K}$, together with a subsequent softmax normalization. In\ngeometric terms, the standard dot-product attention (DPA) leads to large/small\nattention weights for parallel/antiparallel queries and keys. Here we study\nexpressive attention (EA), which is based on $(\\mathbf{Q}^T\\mathbf{K})^2$, the\nsquared dot product. In this case, attention is enhanced when query and key are\neither parallel or antiparallel, and suppressed for orthogonal configurations.\nEA can be introduced into any attention-based code without additional compute\ncosts or memory requirements. For a series of autoregressive prediction tasks,\nwe find that expressive attention performs at least as well as vanilla DPA.\nIncreasing task complexity, EA is observed to outperform DPA with increasing\nmargins, which also holds for multi-task settings. For a given model size, EA\nmanages to achieve 100% performance for a range of complexity levels not\naccessible to DPA. 
Our results show that it is possible to reorganize the\ngeometry of the matching condition in the space of attention heads without loss\nof performance.\n","authors":["Claudius Gros"],"pdf_url":"https://arxiv.org/pdf/2407.18601v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15267v2","updated":"2025-01-08T09:18:05Z","published":"2024-12-17T05:04:57Z","title":"Toxicity Detection towards Adaptability to Changing Perturbations","summary":" Toxicity detection is crucial for maintaining the peace of society. While\nexisting methods perform well on normal toxic content or content generated by\nspecific perturbation methods, they are vulnerable to evolving perturbation\npatterns. However, in real-world scenarios, malicious users tend to create new\nperturbation patterns for fooling the detectors. For example, some users may\ncircumvent the detector of large language models (LLMs) by adding `I am a\nscientist' at the beginning of the prompt. In this paper, we introduce a novel\nproblem, i.e., continual learning of jailbreak perturbation patterns, into the\ntoxicity detection field. To tackle this problem, we first construct a new\ndataset generated by 9 types of perturbation patterns, 7 of which are summarized\nfrom prior work and 2 of which are developed by us. We then systematically\nvalidate the vulnerability of current methods on this new perturbation\npattern-aware dataset via both zero-shot and fine-tuned cross-pattern\ndetection. Building on this, we present the domain incremental learning paradigm and\nthe corresponding benchmark to ensure the detector's robustness to dynamically\nemerging types of perturbed toxic text. 
Our code and dataset are provided in\nthe appendix and will be publicly available at GitHub, by which we wish to\noffer new research opportunities for the security-relevant communities.\n","authors":["Hankun Kang","Jianhao Chen","Yongqi Li","Xin Miao","Mayi Xu","Ming Zhong","Yuanyuan Zhu","Tieyun Qian"],"pdf_url":"https://arxiv.org/pdf/2412.15267v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04366v1","updated":"2025-01-08T09:08:24Z","published":"2025-01-08T09:08:24Z","title":"DispFormer: Pretrained Transformer for Flexible Dispersion Curve\n Inversion from Global Synthesis to Regional Applications","summary":" Surface wave dispersion curve inversion is essential for estimating\nsubsurface Shear-wave velocity ($v_s$), yet traditional methods often struggle\nto balance computational efficiency with inversion accuracy. While deep\nlearning approaches show promise, previous studies typically require large\namounts of labeled data and struggle with real-world datasets that have varying\nperiod ranges, missing data, and low signal-to-noise ratios. This study\nproposes DispFormer, a transformer-based neural network for inverting the $v_s$\nprofile from Rayleigh-wave phase and group dispersion curves. DispFormer\nprocesses dispersion data at each period independently, thereby allowing it to\nhandle data of varying lengths without requiring network modifications or\nalignment between training and testing data. The performance is demonstrated by\npre-training it on a global synthetic dataset and testing it on two regional\nsynthetic datasets using zero-shot and few-shot strategies. Results indicate\nthat zero-shot DispFormer, even without any labeled data, produces inversion\nprofiles that match well with the ground truth, providing a deployable initial\nmodel generator to assist traditional methods. When labeled data is available,\nfew-shot DispFormer outperforms traditional methods with only a small number of\nlabels. 
Furthermore, real-world tests indicate that DispFormer effectively\nhandles varying-length data and yields lower data residuals than reference\nmodels. These findings demonstrate that DispFormer provides a robust foundation\nmodel for dispersion curve inversion and is a promising approach for broader\napplications.\n","authors":["Feng Liu","Bao Deng","Rui Su","Lei Bai","Wanli Ouyang"],"pdf_url":"https://arxiv.org/pdf/2501.04366v1.pdf","comment":"11 pages, 11 figures, related codes and data are available at\n https://github.com/liufeng2317/DispFormer"},{"id":"http://arxiv.org/abs/2404.07965v4","updated":"2025-01-08T09:07:54Z","published":"2024-04-11T17:52:01Z","title":"Rho-1: Not All Tokens Are What You Need","summary":" Previous language model pre-training methods have uniformly applied a\nnext-token prediction loss to all training tokens. Challenging this norm, we\nposit that \"Not all tokens in a corpus are equally important for language model\ntraining\". Our initial analysis examines token-level training\ndynamics of language models, revealing distinct loss patterns for different\ntokens. Leveraging these insights, we introduce a new language model called\nRho-1. Unlike traditional LMs that learn to predict every next token in a\ncorpus, Rho-1 employs Selective Language Modeling (SLM), which selectively\ntrains on useful tokens that are aligned with the desired distribution. This\napproach involves scoring pretraining tokens using a reference model, and then\ntraining the language model with a focused loss on tokens with higher scores.\nWhen continually pretraining on the 15B OpenWebMath corpus, Rho-1 yields an absolute\nimprovement in few-shot accuracy of up to 30% in 9 math tasks. After\nfine-tuning, Rho-1-1B and 7B achieved state-of-the-art results of 40.6% and\n51.8% on the MATH dataset, respectively, matching DeepSeekMath with only 3% of the\npretraining tokens. 
Furthermore, when continually pretraining on 80B general\ntokens, Rho-1 achieves a 6.8% average enhancement across 15 diverse tasks,\nincreasing both efficiency and performance of language model pre-training.\n","authors":["Zhenghao Lin","Zhibin Gou","Yeyun Gong","Xiao Liu","Yelong Shen","Ruochen Xu","Chen Lin","Yujiu Yang","Jian Jiao","Nan Duan","Weizhu Chen"],"pdf_url":"https://arxiv.org/pdf/2404.07965v4.pdf","comment":"First two authors equal contribution"},{"id":"http://arxiv.org/abs/2501.03562v2","updated":"2025-01-08T08:57:32Z","published":"2025-01-07T06:22:55Z","title":"Rethinking Adversarial Attacks in Reinforcement Learning from Policy\n Distribution Perspective","summary":" Deep Reinforcement Learning (DRL) suffers from uncertainties and inaccuracies\nin the observation signal in real-world applications. Adversarial attacks provide an\neffective method for evaluating the robustness of DRL agents. However, existing\nattack methods targeting individual sampled actions have limited impacts on the\noverall policy distribution, particularly in continuous action spaces. To\naddress these limitations, we propose the Distribution-Aware Projected Gradient\nDescent attack (DAPGD). DAPGD uses distribution similarity as the gradient\nperturbation input to attack the policy network, which leverages the entire\npolicy distribution rather than relying on individual samples. We utilize the\nBhattacharyya distance in DAPGD to measure policy similarity, enabling\nsensitive detection of subtle but critical differences between probability\ndistributions. 
Our experimental results demonstrate that DAPGD achieves SOTA\nresults in three robot navigation tasks, with an\naverage 22.03% higher reward drop than the best baseline.\n","authors":["Tianyang Duan","Zongyuan Zhang","Zheng Lin","Yue Gao","Ling Xiong","Yong Cui","Hongbin Liang","Xianhao Chen","Heming Cui","Dong Huang"],"pdf_url":"https://arxiv.org/pdf/2501.03562v2.pdf","comment":"10 pages, 2 figures, 2 tables"},{"id":"http://arxiv.org/abs/2501.04343v1","updated":"2025-01-08T08:30:44Z","published":"2025-01-08T08:30:44Z","title":"TimelineKGQA: A Comprehensive Question-Answer Pair Generator for\n Temporal Knowledge Graphs","summary":" Question answering over temporal knowledge graphs (TKGs) is crucial for\nunderstanding evolving facts and relationships, yet its development is hindered\nby limited datasets and difficulties in generating custom QA pairs. We propose\na novel categorization framework based on timeline-context relationships, along\nwith \\textbf{TimelineKGQA}, a universal temporal QA generator applicable to any\nTKG. The code is available at: \\url{https://github.com/PascalSun/TimelineKGQA}\nas an open source Python package.\n","authors":["Qiang Sun","Sirui Li","Du Huynh","Mark Reynolds","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2501.04343v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01189v3","updated":"2025-01-08T07:59:53Z","published":"2024-06-03T10:51:43Z","title":"MultiMax: Sparse and Multi-Modal Attention Learning","summary":" SoftMax is a ubiquitous ingredient of modern machine learning algorithms. It\nmaps an input vector onto a probability simplex and reweights the input by\nconcentrating the probability mass at large entries. Yet, as a smooth\napproximation to the Argmax function, a significant amount of probability mass\nis distributed to other, residual entries, leading to poor interpretability and\nnoise. 
Although sparsity can be achieved by a family of SoftMax variants, they\noften require an alternative loss function and do not preserve multi-modality.\nWe show that this trade-off between multi-modality and sparsity limits the\nexpressivity of SoftMax as well as its variants. We provide a solution to this\ntension between objectives by proposing a piece-wise differentiable function,\ntermed MultiMax, which adaptively modulates the output distribution according\nto the input entry range. Through comprehensive analysis and evaluation, we show\nthat MultiMax successfully produces a distribution that suppresses irrelevant\nentries while preserving multimodality, with benefits in image classification,\nlanguage modeling and machine translation. The code is available at\nhttps://github.com/ZhouYuxuanYX/MultiMax.\n","authors":["Yuxuan Zhou","Mario Fritz","Margret Keuper"],"pdf_url":"https://arxiv.org/pdf/2406.01189v3.pdf","comment":"Accepted at ICML 2024"},{"id":"http://arxiv.org/abs/2409.14978v2","updated":"2025-01-08T07:53:15Z","published":"2024-09-23T12:57:24Z","title":"TS-HTFA: Advancing Time Series Forecasting via Hierarchical Text-Free\n Alignment with Large Language Models","summary":" Given the significant potential of large language models (LLMs) in sequence\nmodeling, emerging studies have begun applying them to time-series forecasting.\nDespite notable progress, existing methods still face two critical challenges:\n1) their reliance on large amounts of paired text data, limiting the model\napplicability, and 2) a substantial modality gap between text and time series,\nleading to insufficient alignment and suboptimal performance. In this paper, we\nintroduce \\textbf{H}ierarchical \\textbf{T}ext-\\textbf{F}ree \\textbf{A}lignment\n(\\textbf{TS-HTFA}), a novel method that leverages hierarchical alignment to\nfully exploit the representation capacity of LLMs while eliminating the\ndependence on text data. 
Specifically, we replace paired text data with\nadaptive virtual text based on QR decomposition word embeddings and learnable\nprompt. Furthermore, we establish comprehensive cross-modal alignment at three\nlevels: input, feature, and output. Extensive experiments on multiple\ntime-series benchmarks demonstrate that HTFA achieves state-of-the-art\nperformance, significantly improving prediction accuracy and generalization.\n","authors":["Pengfei Wang","Huanran Zheng","Qi'ao Xu","Silong Dai","Yiqiao Wang","Wenjing Yue","Wei Zhu","Tianwen Qian","Xiaoling Wang"],"pdf_url":"https://arxiv.org/pdf/2409.14978v2.pdf","comment":"19 pages, 6 figures"},{"id":"http://arxiv.org/abs/2407.00662v2","updated":"2025-01-08T07:35:31Z","published":"2024-06-30T11:14:29Z","title":"Multi-Agent Training for Pommerman: Curriculum Learning and\n Population-based Self-Play Approach","summary":" Pommerman is a multi-agent environment that has received considerable\nattention from researchers in recent years. This environment is an ideal\nbenchmark for multi-agent training, providing a battleground for two teams with\ncommunication capabilities among allied agents. Pommerman presents significant\nchallenges for model-free reinforcement learning due to delayed action effects,\nsparse rewards, and false positives, where opponent players can lose due to\ntheir own mistakes. This study introduces a system designed to train\nmulti-agent systems to play Pommerman using a combination of curriculum\nlearning and population-based self-play. We also tackle two challenging\nproblems when deploying the multi-agent training system for competitive games:\nsparse reward and suitable matchmaking mechanism. Specifically, we propose an\nadaptive annealing factor based on agents' performance to adjust the dense\nexploration reward during training dynamically. Additionally, we implement a\nmatchmaking mechanism utilizing the Elo rating system to pair agents\neffectively. 
Our experimental results demonstrate that our trained agent can\noutperform top learning agents without requiring communication among allied\nagents.\n","authors":["Nhat-Minh Huynh","Hoang-Giang Cao","I-Chen Wu"],"pdf_url":"https://arxiv.org/pdf/2407.00662v2.pdf","comment":"Accepted at The First Workshop on Game AI Algorithms and Multi-Agent\n Learning - IJCAI 2024"},{"id":"http://arxiv.org/abs/2306.05412v4","updated":"2025-01-08T07:29:55Z","published":"2023-06-08T17:56:46Z","title":"Decoupled Prioritized Resampling for Offline RL","summary":" Offline reinforcement learning (RL) is challenged by the distributional shift\nproblem. To address this problem, existing works mainly focus on designing\nsophisticated policy constraints between the learned policy and the behavior\npolicy. However, these constraints are applied equally to well-performing and\ninferior actions through uniform sampling, which might negatively affect the\nlearned policy. To alleviate this issue, we propose Offline Prioritized\nExperience Replay (OPER), featuring a class of priority functions designed to\nprioritize highly-rewarding transitions, making them more frequently visited\nduring training. Through theoretical analysis, we show that this class of\npriority functions induces an improved behavior policy, and when constrained to\nthis improved policy, a policy-constrained offline RL algorithm is likely to\nyield a better solution. We develop two practical strategies to obtain priority\nweights by estimating advantages based on a fitted value network (OPER-A) or\nutilizing trajectory returns (OPER-R) for quick computation. OPER is a\nplug-and-play component for offline RL algorithms. As case studies, we evaluate\nOPER on five different algorithms, including BC, TD3+BC, Onestep RL, CQL, and\nIQL. Extensive experiments demonstrate that both OPER-A and OPER-R\nsignificantly improve the performance for all baseline methods. 
Code and\npriority weights are available at https://github.com/sail-sg/OPER.\n","authors":["Yang Yue","Bingyi Kang","Xiao Ma","Qisen Yang","Gao Huang","Shiji Song","Shuicheng Yan"],"pdf_url":"https://arxiv.org/pdf/2306.05412v4.pdf","comment":"published in IEEE TNNLS"},{"id":"http://arxiv.org/abs/2411.07464v2","updated":"2025-01-08T07:25:55Z","published":"2024-11-12T00:57:30Z","title":"BudgetMLAgent: A Cost-Effective LLM Multi-Agent system for Automating\n Machine Learning Tasks","summary":" Large Language Models (LLMs) excel in diverse applications including\ngeneration of code snippets, but often struggle with generating code for\ncomplex Machine Learning (ML) tasks. Although existing LLM single-agent based\nsystems give varying performance depending on the task complexity, they purely\nrely on larger, expensive models such as GPT-4. Our investigation reveals\nthat no-cost and low-cost models such as Gemini-Pro, Mixtral and CodeLlama\nperform far worse than GPT-4 in a single-agent setting. With the motivation of\ndeveloping a cost-efficient LLM based solution for solving ML tasks, we propose\nan LLM Multi-Agent based system which leverages a combination of experts using\nprofiling, efficient retrieval of past observations, LLM cascades, and\nask-the-expert calls. Through empirical analysis on ML engineering tasks in the\nMLAgentBench benchmark, we demonstrate the effectiveness of our system, using\nno-cost models, namely Gemini as the base LLM, paired with GPT-4 in cascade and\nexpert to serve occasional ask-the-expert calls for planning. 
With 94.2\\%\nreduction in the cost (from \\$0.931 per run cost averaged over all tasks for\nGPT-4 single agent system to \\$0.054), our system is able to yield better\naverage success rate of 32.95\\% as compared to GPT-4 single-agent system\nyielding 22.72\\% success rate averaged over all the tasks of MLAgentBench.\n","authors":["Shubham Gandhi","Manasi Patwardhan","Lovekesh Vig","Gautam Shroff"],"pdf_url":"https://arxiv.org/pdf/2411.07464v2.pdf","comment":"Presented at AIMLSystems '24"},{"id":"http://arxiv.org/abs/2408.14418v3","updated":"2025-01-08T07:23:56Z","published":"2024-08-26T17:04:00Z","title":"MEDSAGE: Enhancing Robustness of Medical Dialogue Summarization to ASR\n Errors with LLM-generated Synthetic Dialogues","summary":" Automatic Speech Recognition (ASR) systems are pivotal in transcribing speech\ninto text, yet the errors they introduce can significantly degrade the\nperformance of downstream tasks like summarization. This issue is particularly\npronounced in clinical dialogue summarization, a low-resource domain where\nsupervised data for fine-tuning is scarce, necessitating the use of ASR models\nas black-box solutions. Employing conventional data augmentation for enhancing\nthe noise robustness of summarization models is not feasible either due to the\nunavailability of sufficient medical dialogue audio recordings and\ncorresponding ASR transcripts. To address this challenge, we propose MEDSAGE,\nan approach for generating synthetic samples for data augmentation using Large\nLanguage Models (LLMs). Specifically, we leverage the in-context learning\ncapabilities of LLMs and instruct them to generate ASR-like errors based on a\nfew available medical dialogue examples with audio recordings. Experimental\nresults show that LLMs can effectively model ASR noise, and incorporating this\nnoisy data into the training process significantly improves the robustness and\naccuracy of medical dialogue summarization systems. 
This approach addresses the\nchallenges of noisy ASR outputs in critical applications, offering a robust\nsolution to enhance the reliability of clinical dialogue summarization.\n","authors":["Kuluhan Binici","Abhinav Ramesh Kashyap","Viktor Schlegel","Andy T. Liu","Vijay Prakash Dwivedi","Thanh-Tung Nguyen","Xiaoxue Gao","Nancy F. Chen","Stefan Winkler"],"pdf_url":"https://arxiv.org/pdf/2408.14418v3.pdf","comment":"Accepted by the Thirty-Ninth AAAI Conference on Artificial\n Intelligence (AAAI-25)"},{"id":"http://arxiv.org/abs/2407.16040v2","updated":"2025-01-08T07:21:15Z","published":"2024-07-22T20:34:00Z","title":"Generalizing Teacher Networks for Effective Knowledge Distillation\n Across Student Architectures","summary":" Knowledge distillation (KD) is a model compression method that entails\ntraining a compact student model to emulate the performance of a more complex\nteacher model. However, the architectural capacity gap between the two models\nlimits the effectiveness of knowledge transfer. Addressing this issue, previous\nworks focused on customizing teacher-student pairs to improve compatibility, a\ncomputationally expensive process that needs to be repeated every time either\nmodel changes. Hence, these methods are impractical when a teacher model has to\nbe compressed into different student models for deployment on multiple hardware\ndevices with distinct resource constraints. In this work, we propose Generic\nTeacher Network (GTN), a one-off KD-aware training to create a generic teacher\ncapable of effectively transferring knowledge to any student model sampled from\na given finite pool of architectures. To this end, we represent the student\npool as a weight-sharing supernet and condition our generic teacher to align\nwith the capacities of various student architectures sampled from this\nsupernet. 
Experimental evaluation shows that our method both improves overall\nKD effectiveness and amortizes the minimal additional training cost of the\ngeneric teacher across students in the pool.\n","authors":["Kuluhan Binici","Weiming Wu","Tulika Mitra"],"pdf_url":"https://arxiv.org/pdf/2407.16040v2.pdf","comment":"British Machine Vision Conference (BMVC 24)"},{"id":"http://arxiv.org/abs/2409.12444v3","updated":"2025-01-08T07:19:14Z","published":"2024-09-19T03:52:50Z","title":"A Lightweight and Real-Time Binaural Speech Enhancement Model with\n Spatial Cues Preservation","summary":" Binaural speech enhancement (BSE) aims to jointly improve the speech quality\nand intelligibility of noisy signals received by hearing devices and preserve\nthe spatial cues of the target for natural listening. Existing methods often\nsuffer from the compromise between noise reduction (NR) capacity and spatial\ncues preservation (SCP) accuracy and a high computational demand in complex\nacoustic scenes. In this work, we present a learning-based lightweight binaural\ncomplex convolutional network (LBCCN), which excels in NR by filtering\nlow-frequency bands and keeping the rest. Additionally, our approach explicitly\nincorporates the estimation of interchannel relative acoustic transfer function\nto ensure the spatial cues fidelity and speech clarity. Results show that the\nproposed LBCCN can achieve a comparable NR performance to state-of-the-art\nmethods under fixed-speaker conditions, but with a much lower computational\ncost and a certain degree of SCP capability. 
The reproducible code and audio\nexamples are available at https://github.com/jywanng/LBCCN.\n","authors":["Jingyuan Wang","Jie Zhang","Shihao Chen","Miao Sun"],"pdf_url":"https://arxiv.org/pdf/2409.12444v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04315v1","updated":"2025-01-08T07:13:52Z","published":"2025-01-08T07:13:52Z","title":"RoRA: Efficient Fine-Tuning of LLM with Reliability Optimization for\n Rank Adaptation","summary":" Fine-tuning helps large language models (LLMs) recover degraded information\nand enhance task performance. Although Low-Rank Adaptation (LoRA) is widely used\nand effective for fine-tuning, we have observed that its scaling factor can\nlimit or even reduce performance as the rank size increases. To address this\nissue, we propose RoRA (Rank-adaptive Reliability Optimization), a simple yet\neffective method for optimizing LoRA's scaling factor. By replacing $\\alpha/r$\nwith $\\alpha/\\sqrt{r}$, RoRA ensures improved performance as rank size\nincreases. Moreover, RoRA enhances low-rank adaptation in fine-tuning\nuncompressed models and excels in the more challenging task of accuracy\nrecovery when fine-tuning pruned models. Extensive experiments demonstrate the\neffectiveness of RoRA in fine-tuning both uncompressed and pruned models. RoRA\nsurpasses the state-of-the-art (SOTA) in average accuracy and robustness on\nLLaMA-7B/13B, LLaMA2-7B, and LLaMA3-8B, specifically outperforming LoRA and\nDoRA by 6.5% and 2.9% on LLaMA-7B, respectively. 
In pruned model fine-tuning,\nRoRA shows significant advantages; for SHEARED-LLAMA-1.3, a LLaMA-7B with 81.4%\npruning, RoRA achieves 5.7% higher average accuracy than LoRA and 3.9% higher\nthan DoRA.\n","authors":["Jun Liu","Zhenglun Kong","Peiyan Dong","Xuan Shen","Pu Zhao","Hao Tang","Geng Yuan","Wei Niu","Wenbin Zhang","Xue Lin","Dong Huang","Yanzhi Wang"],"pdf_url":"https://arxiv.org/pdf/2501.04315v1.pdf","comment":"ICASSP 2025"},{"id":"http://arxiv.org/abs/2412.13720v2","updated":"2025-01-08T07:03:42Z","published":"2024-12-18T11:00:58Z","title":"Federated Learning and RAG Integration: A Scalable Approach for Medical\n Large Language Models","summary":" This study analyzes the performance of domain-specific Large Language Models\n(LLMs) for the medical field by integrating Retrieval-Augmented Generation\n(RAG) systems within a federated learning framework. Leveraging the inherent\nadvantages of federated learning, such as preserving data privacy and enabling\ndistributed computation, this research explores the integration of RAG systems\nwith models trained under varying client configurations to optimize\nperformance. Experimental results demonstrate that the federated learning-based\nmodels integrated with RAG systems consistently outperform their non-integrated\ncounterparts across all evaluation metrics. 
This study highlights the potential\nof combining federated learning and RAG systems for developing domain-specific\nLLMs in the medical field, providing a scalable and privacy-preserving solution\nfor enhancing text generation capabilities.\n","authors":["Jincheol Jung","Hongju Jeong","Eui-Nam Huh"],"pdf_url":"https://arxiv.org/pdf/2412.13720v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19681v3","updated":"2025-01-08T06:56:19Z","published":"2024-07-29T03:53:14Z","title":"Motion Manifold Flow Primitives for Task-Conditioned Trajectory\n Generation under Complex Task-Motion Dependencies","summary":" Effective movement primitives should be capable of encoding and generating a\nrich repertoire of trajectories -- typically collected from human\ndemonstrations -- conditioned on task-defining parameters such as vision or\nlanguage inputs. While recent methods based on the motion manifold hypothesis,\nwhich assumes that a set of trajectories lies on a lower-dimensional nonlinear\nsubspace, address challenges such as limited dataset size and the high\ndimensionality of trajectory data, they often struggle to capture complex\ntask-motion dependencies, i.e., when motion distributions shift drastically\nwith task variations. To address this, we introduce Motion Manifold Flow\nPrimitives (MMFP), a framework that decouples the training of the motion\nmanifold from task-conditioned distributions. Specifically, we employ flow\nmatching models, state-of-the-art conditional deep generative models, to learn\ntask-conditioned distributions in the latent coordinate space of the learned\nmotion manifold. Experiments are conducted on language-guided trajectory\ngeneration tasks, where many-to-many text-motion correspondences introduce\ncomplex task-motion dependencies, highlighting MMFP's superiority over existing\nmethods.\n","authors":["Yonghyeon Lee","Byeongho Lee","Seungyeon Kim","Frank C. 
Park"],"pdf_url":"https://arxiv.org/pdf/2407.19681v3.pdf","comment":"8 pages, 11 figures"},{"id":"http://arxiv.org/abs/2404.01714v4","updated":"2025-01-08T06:52:07Z","published":"2024-04-02T07:57:17Z","title":"Conjugate-Gradient-like Based Adaptive Moment Estimation Optimization\n Algorithm for Deep Learning","summary":" Training deep neural networks is a challenging task. In order to speed up\ntraining and enhance the performance of deep neural networks, we rectify the\nvanilla conjugate gradient as conjugate-gradient-like and incorporate it into\nthe generic Adam, and thus propose a new optimization algorithm named\nCG-like-Adam for deep learning. Specifically, both the first-order and the\nsecond-order moment estimation of generic Adam are replaced by the\nconjugate-gradient-like. Convergence analysis handles the cases where the\nexponential moving average coefficient of the first-order moment estimation is\nconstant and the first-order moment estimation is unbiased. Numerical\nexperiments show the superiority of the proposed algorithm based on the\nCIFAR10/100 dataset.\n","authors":["Jiawu Tian","Liwei Xu","Xiaowei Zhang","Yongqi Li"],"pdf_url":"https://arxiv.org/pdf/2404.01714v4.pdf","comment":"32 pages, 13 figures"},{"id":"http://arxiv.org/abs/2412.14500v2","updated":"2025-01-08T06:52:05Z","published":"2024-12-19T03:48:23Z","title":"The Digital Ecosystem of Beliefs: does evolution favour AI over humans?","summary":" As AI systems are integrated into social networks, there are AI safety\nconcerns that AI-generated content may dominate the web, e.g. in popularity or\nimpact on beliefs. To understand such questions, this paper proposes the\nDigital Ecosystem of Beliefs (Digico), the first evolutionary framework for\ncontrolled experimentation with multi-population interactions in simulated\nsocial networks. 
The framework models a population of agents which change their\nmessaging strategies due to evolutionary updates following a Universal\nDarwinism approach, interact via messages, influence each other's beliefs\nthrough dynamics based on a contagion model, and maintain their beliefs through\ncognitive Lamarckian inheritance. Initial experiments with an abstract\nimplementation of Digico show that: a) when AIs have faster messaging,\nevolution, and more influence in the recommendation algorithm, they get 80% to\n95% of the views, depending on the size of the influence benefit; b) AIs\ndesigned for propaganda can typically convince 50% of humans to adopt extreme\nbeliefs, and up to 85% when agents believe only a limited number of channels;\nc) a penalty for content that violates agents' beliefs reduces propaganda\neffectiveness by up to 8%. We further discuss implications for control (e.g.\nlegislation) and Digico as a means of studying evolutionary principles.\n","authors":["David M. Bossens","Shanshan Feng","Yew-Soon Ong"],"pdf_url":"https://arxiv.org/pdf/2412.14500v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05271v2","updated":"2025-01-08T06:30:39Z","published":"2023-09-11T07:05:02Z","title":"AutoFuse: Automatic Fusion Networks for Deformable Medical Image\n Registration","summary":" Deformable image registration aims to find a dense non-linear spatial\ncorrespondence between a pair of images, which is a crucial step for many\nmedical tasks such as tumor growth monitoring and population analysis.\nRecently, Deep Neural Networks (DNNs) have been widely recognized for their\nability to perform fast end-to-end registration. However, DNN-based\nregistration needs to explore the spatial information of each image and fuse\nthis information to characterize spatial correspondence. This raises an\nessential question: what is the optimal fusion strategy to characterize spatial\ncorrespondence? 
Existing fusion strategies (e.g., early fusion, late fusion)\nwere empirically designed to fuse information by manually defined prior\nknowledge, which inevitably constrains the registration performance within the\nlimits of empirical designs. In this study, we depart from existing\nempirically-designed fusion strategies and develop a data-driven fusion\nstrategy for deformable image registration. To achieve this, we propose an\nAutomatic Fusion network (AutoFuse) that provides flexibility to fuse\ninformation at many potential locations within the network. A Fusion Gate (FG)\nmodule is also proposed to control how to fuse information at each potential\nnetwork location based on training data. Our AutoFuse can automatically\noptimize its fusion strategy during training and generalizes to both\nunsupervised registration (without any labels) and semi-supervised registration\n(with weak labels provided for partial training data). Extensive experiments on\ntwo well-benchmarked medical registration tasks (inter- and intra-patient\nregistration) with eight public datasets show that our AutoFuse outperforms\nstate-of-the-art unsupervised and semi-supervised registration methods.\n","authors":["Mingyuan Meng","Michael Fulham","Dagan Feng","Lei Bi","Jinman Kim"],"pdf_url":"https://arxiv.org/pdf/2309.05271v2.pdf","comment":"Published at Pattern Recognition"},{"id":"http://arxiv.org/abs/2501.04302v1","updated":"2025-01-08T06:26:16Z","published":"2025-01-08T06:26:16Z","title":"H-MBA: Hierarchical MamBa Adaptation for Multi-Modal Video Understanding\n in Autonomous Driving","summary":" With the prevalence of Multimodal Large Language Models (MLLMs), autonomous\ndriving has encountered new opportunities and challenges. In particular,\nmulti-modal video understanding is critical to interactively analyze what will\nhappen during autonomous driving. 
However, videos in such dynamic scenes often\ncontain complex spatial-temporal movements, which restrict the generalization\ncapacity of existing MLLMs in this field. To\nbridge the gap, we propose a novel Hierarchical Mamba Adaptation (H-MBA)\nframework to fit the complicated motion changes in autonomous driving videos.\nSpecifically, our H-MBA consists of two distinct modules, including Context\nMamba (C-Mamba) and Query Mamba (Q-Mamba). First, C-Mamba contains various\ntypes of structure state space models, which can effectively capture\nmulti-granularity video context for different temporal resolutions. Second,\nQ-Mamba flexibly transforms the current frame into a learnable query, and\nattentively selects multi-granularity video context into the query. Consequently,\nit can adaptively integrate all the video contexts of multi-scale temporal\nresolutions to enhance video understanding. Via a plug-and-play paradigm in\nMLLMs, our H-MBA shows remarkable performance on multi-modal video tasks in\nautonomous driving, e.g., for risk object detection, it outperforms the\nprevious SOTA method with a 5.5% mIoU improvement.\n","authors":["Siran Chen","Yuxiao Luo","Yue Ma","Yu Qiao","Yali Wang"],"pdf_url":"https://arxiv.org/pdf/2501.04302v1.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2501.04299v1","updated":"2025-01-08T06:07:33Z","published":"2025-01-08T06:07:33Z","title":"Circuit Complexity Bounds for Visual Autoregressive Model","summary":" Understanding the expressive ability of a specific model is essential for\ngrasping its capacity limitations. Recently, several studies have established\ncircuit complexity bounds for the Transformer architecture. Besides, the Visual\nAutoRegressive (VAR) model has risen to be a prominent method in the field of\nimage generation, outperforming previous techniques, such as Diffusion\nTransformers, in generating high-quality images. 
We investigate the circuit\ncomplexity of the VAR model and establish a bound in this study. Our primary\nresult demonstrates that the VAR model is equivalent to a simulation by a\nuniform $\\mathsf{TC}^0$ threshold circuit with hidden dimension $d \\leq O(n)$\nand $\\mathrm{poly}(n)$ precision. This is the first study to rigorously\nhighlight the limitations in the expressive power of VAR models despite their\nimpressive performance. We believe our findings will offer valuable insights\ninto the inherent constraints of these models and guide the development of more\nefficient and expressive architectures in the future.\n","authors":["Yekun Ke","Xiaoyu Li","Yingyu Liang","Zhenmei Shi","Zhao Song"],"pdf_url":"https://arxiv.org/pdf/2501.04299v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12386v2","updated":"2025-01-08T05:57:28Z","published":"2024-09-19T01:02:31Z","title":"Channel-Aware Domain-Adaptive Generative Adversarial Network for Robust\n Speech Recognition","summary":" While pre-trained automatic speech recognition (ASR) systems demonstrate\nimpressive performance on matched domains, their performance often degrades\nwhen confronted with channel mismatch stemming from unseen recording\nenvironments and conditions. To mitigate this issue, we propose a novel\nchannel-aware data simulation method for robust ASR training. Our method\nharnesses the synergistic power of channel-extractive techniques and generative\nadversarial networks (GANs). We first train a channel encoder capable of\nextracting embeddings from arbitrary audio. On top of this, channel embeddings\nare extracted using a minimal amount of target-domain data and used to guide a\nGAN-based speech synthesizer. This synthesizer generates speech that faithfully\npreserves the phonetic content of the input while mimicking the channel\ncharacteristics of the target domain. 
We evaluate our method on the challenging\nHakka Across Taiwan (HAT) and Taiwanese Across Taiwan (TAT) corpora, achieving\nrelative character error rate (CER) reductions of 20.02% and 9.64%,\nrespectively, compared to the baselines. These results highlight the efficacy\nof our channel-aware data simulation method for bridging the gap between\nsource- and target-domain acoustics.\n","authors":["Chien-Chun Wang","Li-Wei Chen","Cheng-Kang Chou","Hung-Shin Lee","Berlin Chen","Hsin-Min Wang"],"pdf_url":"https://arxiv.org/pdf/2409.12386v2.pdf","comment":"Accepted to ICASSP 2025"},{"id":"http://arxiv.org/abs/2501.04292v1","updated":"2025-01-08T05:32:55Z","published":"2025-01-08T05:32:55Z","title":"MAD-UV: The 1st INTERSPEECH Mice Autism Detection via Ultrasound\n Vocalization Challenge","summary":" The Mice Autism Detection via Ultrasound Vocalization (MAD-UV) Challenge\nintroduces the first INTERSPEECH challenge focused on detecting autism spectrum\ndisorder (ASD) in mice through their vocalizations. Participants are tasked\nwith developing models to automatically classify mice as either wild-type or\nASD models based on recordings with a high sampling rate. Our baseline system\nemploys a simple CNN-based classification using three different spectrogram\nfeatures. Results demonstrate the feasibility of automated ASD detection, with\nthe considered audible-range features achieving the best performance (UAR of\n0.600 for segment-level and 0.625 for subject-level classification). This\nchallenge bridges speech technology and biomedical research, offering\nopportunities to advance our understanding of ASD models through machine\nlearning approaches. The findings suggest promising directions for vocalization\nanalysis and highlight the potential value of audible and ultrasound\nvocalizations in ASD detection.\n","authors":["Zijiang Yang","Meishu Song","Xin Jing","Haojie Zhang","Kun Qian","Bin Hu","Kota Tamada","Toru Takumi","Björn W. 
Schuller","Yoshiharu Yamamoto"],"pdf_url":"https://arxiv.org/pdf/2501.04292v1.pdf","comment":"5 pages, 1 figure and 2 tables. For MAD-UV Challenge 2025"},{"id":"http://arxiv.org/abs/2412.04604v2","updated":"2025-01-08T05:24:50Z","published":"2024-12-05T20:40:28Z","title":"ARC Prize 2024: Technical Report","summary":" As of December 2024, the ARC-AGI benchmark is five years old and remains\nunbeaten. We believe it is currently the most important unsolved AI benchmark\nin the world because it seeks to measure generalization on novel tasks -- the\nessence of intelligence -- as opposed to skill at tasks that can be prepared\nfor in advance. This year, we launched ARC Prize, a global competition to\ninspire new ideas and drive open progress towards AGI by reaching a target\nbenchmark score of 85\\%. As a result, the state-of-the-art score on the ARC-AGI\nprivate evaluation set increased from 33\\% to 55.5\\%, propelled by several\nfrontier AGI reasoning techniques including deep learning-guided program\nsynthesis and test-time training. In this paper, we survey top approaches,\nreview new open-source implementations, discuss the limitations of the\nARC-AGI-1 dataset, and share key insights gained from the competition.\n","authors":["Francois Chollet","Mike Knoop","Gregory Kamradt","Bryan Landers"],"pdf_url":"https://arxiv.org/pdf/2412.04604v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04286v1","updated":"2025-01-08T05:24:11Z","published":"2025-01-08T05:24:11Z","title":"Mapping the Edge of Chaos: Fractal-Like Boundaries in The Trainability\n of Decoder-Only Transformer Models","summary":" In the realm of fractal geometry, intricate structures emerge from simple\niterative processes that partition parameter spaces into regions of stability\nand instability. 
Likewise, training large language models involves iteratively\napplying update functions, such as Adam, where even slight hyperparameter\nadjustments can shift the training process from convergence to divergence.\nRecent evidence from miniature neural networks suggests that the boundary\nseparating these outcomes displays fractal characteristics [1]. Building on\nthese insights, this study extends them to medium-sized, decoder-only\ntransformer architectures by employing a more consistent convergence measure\nand examining the learning rate hyperparameter landscape for attention and\nfully connected layers. The results show that the trainability frontier is not\na simple threshold; rather, it forms a self-similar yet seemingly random\nstructure at multiple scales, with statistically consistent and repeating\npatterns. Within this landscape, a region of stable convergence is surrounded\nby a complex chaotic border, illustrating the sensitive nature of the\nunderlying training dynamics.\n","authors":["Bahman Torkamandi"],"pdf_url":"https://arxiv.org/pdf/2501.04286v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2501.04283v1","updated":"2025-01-08T05:14:36Z","published":"2025-01-08T05:14:36Z","title":"Enhancing Scene Classification in Cloudy Image Scenarios: A\n Collaborative Transfer Method with Information Regulation Mechanism using\n Optical Cloud-Covered and SAR Remote Sensing Images","summary":" In remote sensing scene classification, leveraging the transfer methods with\nwell-trained optical models is an efficient way to overcome label scarcity.\nHowever, cloud contamination leads to optical information loss and significant\nimpacts on feature distribution, challenging the reliability and stability of\ntransferred target models. Common solutions include cloud removal for optical\ndata or directly using Synthetic aperture radar (SAR) data in the target\ndomain. 
However, cloud removal requires substantial auxiliary data for support\nand pre-training, while directly using SAR disregards the unobstructed portions\nof optical data. This study presents a scene classification transfer method\nthat synergistically combines multi-modality data, which aims to transfer the\nsource domain model trained on cloud-free optical data to the target domain that\nincludes both cloudy optical and SAR data at low cost. Specifically, the\nframework incorporates two parts: (1) the collaborative transfer strategy,\nbased on knowledge distillation, enables efficient prior knowledge transfer\nacross heterogeneous data; (2) the information regulation mechanism (IRM) is\nproposed to address the modality imbalance issue during transfer. It employs\nauxiliary models to measure the contribution discrepancy of each modality, and\nautomatically balances the information utilization of modalities during the\ntarget model learning process at the sample level. The transfer experiments\nwere conducted on simulated and real cloud datasets, demonstrating the superior\nperformance of the proposed method compared to other solutions in cloud-covered\nscenarios. We also verified the importance and limitations of IRM, and further\ndiscussed and visualized the modality imbalance problem during the model\ntransfer. Codes are available at https://github.com/wangyuze-csu/ESCCS\n","authors":["Yuze Wang","Rong Xiao","Haifeng Li","Mariana Belgiu","Chao Tao"],"pdf_url":"https://arxiv.org/pdf/2501.04283v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03295v2","updated":"2025-01-08T04:50:01Z","published":"2025-01-06T11:43:29Z","title":"A Soft Sensor Method with Uncertainty-Awareness and Self-Explanation\n Based on Large Language Models Enhanced by Domain Knowledge Retrieval","summary":" Data-driven soft sensors are crucial in predicting key performance indicators\nin industrial systems. 
However, current methods predominantly rely on the\nsupervised learning paradigm of parameter updating, which inherently faces\nchallenges such as high development costs, poor robustness, training\ninstability, and lack of interpretability. Recently, large language models\n(LLMs) have demonstrated significant potential across various domains, notably\nthrough In-Context Learning (ICL), which enables high-performance task\nexecution with minimal input-label demonstrations and no prior training. This\npaper aims to replace supervised learning with the emerging ICL paradigm for\nsoft sensor modeling to address existing challenges and explore new avenues for\nadvancement. To achieve this, we propose a novel framework called the Few-shot\nUncertainty-aware and self-Explaining Soft Sensor (LLM-FUESS), which includes\nthe Zero-shot Auxiliary Variable Selector (LLM-ZAVS) and the Uncertainty-aware\nFew-shot Soft Sensor (LLM-UFSS). The LLM-ZAVS retrieves from the Industrial\nKnowledge Vector Storage to enhance LLMs' domain-specific knowledge, enabling\nzero-shot auxiliary variable selection. In the LLM-UFSS, we utilize text-based\ncontext demonstrations of structured data to prompt LLMs to execute ICL for\nprediction, and propose a context sample retrieval augmentation strategy to\nimprove performance. Additionally, we explored LLMs' AIGC and probabilistic\ncharacteristics to propose self-explanation and uncertainty quantification\nmethods for constructing a trustworthy soft sensor. Extensive experiments\ndemonstrate that our method achieves state-of-the-art predictive performance,\nstrong robustness, and flexibility, and effectively mitigates the training\ninstability found in traditional methods. 
To the best of our knowledge, this is the first\nwork to establish a soft sensor utilizing LLMs.\n","authors":["Shuo Tong","Han Liu","Runyuan Guo","Wenqing Wang","Xueqiong Tian","Lingyun Wei","Lin Zhang","Huayong Wu","Ding Liu","Youmin Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.03295v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09420v3","updated":"2025-01-08T04:31:16Z","published":"2024-11-14T13:15:27Z","title":"SAG-ViT: A Scale-Aware, High-Fidelity Patching Approach with Graph\n Attention for Vision Transformers","summary":" Vision Transformers (ViTs) have redefined image classification by leveraging\nself-attention to capture complex patterns and long-range dependencies between\nimage patches. However, a key challenge for ViTs is efficiently incorporating\nmulti-scale feature representations, which is inherent in convolutional neural\nnetworks (CNNs) through their hierarchical structure. Graph transformers have\nmade strides in addressing this by leveraging graph-based modeling, but they\noften lose or insufficiently represent spatial hierarchies, especially since\nredundant or less relevant areas dilute the image's contextual representation.\nTo bridge this gap, we propose SAG-ViT, a Scale-Aware Graph Attention ViT that\nintegrates the multi-scale feature capabilities of CNNs, the representational\npower of ViTs, and graph-attended patching to enable richer contextual\nrepresentation. Using\nEfficientNetV2 as a backbone, the model extracts multi-scale feature maps,\ndividing them into patches to preserve richer semantic information compared to\ndirectly patching the input images. The patches are structured into a graph\nusing spatial and feature similarities, where a Graph Attention Network (GAT)\nrefines the node embeddings. This refined graph representation is then\nprocessed by a Transformer encoder, capturing long-range dependencies and\ncomplex interactions. 
We evaluate SAG-ViT on benchmark datasets across various\ndomains, validating its effectiveness in advancing image classification tasks.\nOur code and weights are available at https://github.com/shravan-18/SAG-ViT.\n","authors":["Shravan Venkatraman","Jaskaran Singh Walia","Joe Dhanith P R"],"pdf_url":"https://arxiv.org/pdf/2411.09420v3.pdf","comment":"14 pages, 8 figures, 9 tables"},{"id":"http://arxiv.org/abs/2501.04266v1","updated":"2025-01-08T04:19:57Z","published":"2025-01-08T04:19:57Z","title":"Scaling Large Language Model Training on Frontier with Low-Bandwidth\n Partitioning","summary":" Scaling up Large Language Model (LLM) training involves fitting a tremendous\nnumber of training parameters across a limited number of workers. However,\nmethods like ZeRO-3 that drastically reduce GPU memory pressure often incur\nheavy communication to ensure global synchronization and consistency.\nEstablished efforts such as ZeRO++ use secondary partitions to avoid inter-node\ncommunications, given that intra-node GPU-GPU transfer generally has more\nbandwidth and lower latency than inter-node connections. However, as more\ncapable infrastructure like Frontier, equipped with AMD GPUs, emerged with\nimpressive computing capability, there is a need to investigate the\nhardware topology and to develop targeted strategies to improve training\nefficiency. In this work, we propose a collection of communication and\noptimization strategies for ZeRO++ to reduce communication costs and improve\nmemory utilization. Specifically, we propose a 3-level hierarchical\npartitioning for the current Top-1 supercomputing cluster,\nFrontier, which aims at leveraging various bandwidths across layers of\ncommunications (GCD-GCD, GPU-GPU, and inter-node) to reduce communication\noverhead. For a 20B GPT model, we observe a 1.71x increase in TFLOPS per GPU\nwhen compared with ZeRO++ up to 384 GCDs and a scaling efficiency of 0.94 for\nup to 384 GCDs. 
To the best of our knowledge, our work is also the first effort\nto efficiently optimize LLM workloads on Frontier AMD GPUs.\n","authors":["Lang Xu","Quentin Anthony","Jacob Hatef","Aamir Shafi","Hari Subramoni","Dhabaleswar K. Panda"],"pdf_url":"https://arxiv.org/pdf/2501.04266v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04263v1","updated":"2025-01-08T04:14:09Z","published":"2025-01-08T04:14:09Z","title":"KN-LIO: Geometric Kinematics and Neural Field Coupled LiDAR-Inertial\n Odometry","summary":" Recent advancements in LiDAR-Inertial Odometry (LIO) have enabled a large\nnumber of applications. However, traditional LIO systems tend to focus more on\nlocalization rather than mapping, with maps consisting mostly of sparse\ngeometric elements, which is not ideal for downstream tasks. Recent emerging\nneural field technology has great potential in dense mapping, but pure LiDAR\nmapping struggles on highly dynamic vehicles. To mitigate this\nchallenge, we present a new solution that tightly couples geometric kinematics\nwith neural fields to enhance simultaneous state estimation and dense mapping\ncapabilities. We propose both semi-coupled and tightly coupled Kinematic-Neural\nLIO (KN-LIO) systems that leverage online SDF decoding and iterated error-state\nKalman filtering to fuse laser and inertial data. Our KN-LIO minimizes\ninformation loss and improves accuracy in state estimation, while also\naccommodating asynchronous multi-LiDAR inputs. Evaluations on diverse\nhigh-dynamic datasets demonstrate that our KN-LIO achieves performance on par\nwith or superior to existing state-of-the-art solutions in pose estimation and\noffers improved dense mapping accuracy over pure LiDAR-based methods. 
The\nrelevant code and datasets will be made available at https://**.\n","authors":["Zhong Wang","Lele Ren","Yue Wen","Hesheng Wang"],"pdf_url":"https://arxiv.org/pdf/2501.04263v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.00547v2","updated":"2025-01-08T04:14:07Z","published":"2024-11-30T17:40:49Z","title":"Motion Dreamer: Realizing Physically Coherent Video Generation through\n Scene-Aware Motion Reasoning","summary":" Numerous recent video generation models, also known as world models, have\ndemonstrated the ability to generate plausible real-world videos. However, many\nstudies have shown that these models often produce motion results lacking\nlogical or physical coherence. In this paper, we revisit video generation\nmodels and find that single-stage approaches struggle to produce high-quality\nresults while maintaining coherent motion reasoning. To address this issue, we\npropose \\textbf{Motion Dreamer}, a two-stage video generation framework. In\nStage I, the model generates an intermediate motion representation, such as a\nsegmentation map or depth map, based on the input image and motion conditions,\nfocusing solely on the motion itself. In Stage II, the model uses this\nintermediate motion representation as a condition to generate a high-detail\nvideo. By decoupling motion reasoning from high-fidelity video synthesis, our\napproach allows for more accurate and physically plausible motion generation.\nWe validate the effectiveness of our approach on the Physion dataset and in\nautonomous driving scenarios. For example, given a single push, our model can\nsynthesize the sequential toppling of a set of dominoes. Similarly, by varying\nthe movements of ego-cars, our model can produce different effects on other\nvehicles. Our work opens new avenues in creating models that can reason about\nphysical interactions in a more coherent and realistic manner. 
Our webpage is\navailable: https://envision-research.github.io/MotionDreamer/.\n","authors":["Tianshuo Xu","Zhifei Chen","Leyi Wu","Hao Lu","Yuying Chen","Lihui Jiang","Bingbing Liu","Yingcong Chen"],"pdf_url":"https://arxiv.org/pdf/2412.00547v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03271v2","updated":"2025-01-08T03:51:59Z","published":"2025-01-05T00:08:52Z","title":"DPO Kernels: A Semantically-Aware, Kernel-Enhanced, and Divergence-Rich\n Paradigm for Direct Preference Optimization","summary":" The rapid rise of large language models (LLMs) has unlocked many applications\nbut also underscores the challenge of aligning them with diverse values and\npreferences. Direct Preference Optimization (DPO) is central to alignment but\nconstrained by fixed divergences and limited feature transformations. We\npropose DPO-Kernels, which integrates kernel methods to address these issues\nthrough four key contributions: (i) Kernelized Representations with polynomial,\nRBF, Mahalanobis, and spectral kernels for richer transformations, plus a\nhybrid loss combining embedding-based and probability-based objectives; (ii)\nDivergence Alternatives (Jensen-Shannon, Hellinger, Renyi, Bhattacharyya,\nWasserstein, and f-divergences) for greater stability; (iii) Data-Driven\nSelection metrics that automatically choose the best kernel-divergence pair;\nand (iv) a Hierarchical Mixture of Kernels for both local precision and global\nmodeling. Evaluations on 12 datasets demonstrate state-of-the-art performance\nin factuality, safety, reasoning, and instruction following. 
Grounded in\nHeavy-Tailed Self-Regularization, DPO-Kernels maintains robust generalization\nfor LLMs, offering a comprehensive resource for further alignment research.\n","authors":["Amitava Das","Suranjana Trivedy","Danush Khanna","Rajarshi Roy","Gurpreet Singh","Basab Ghosh","Yaswanth Narsupalli","Vinija Jain","Vasu Sharma","Aishwarya Naresh Reganti","Aman Chadha"],"pdf_url":"https://arxiv.org/pdf/2501.03271v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04253v1","updated":"2025-01-08T03:35:28Z","published":"2025-01-08T03:35:28Z","title":"Integrated Offline and Online Learning to Solve a Large Class of\n Scheduling Problems","summary":" In this paper, we develop a unified machine learning (ML) approach to predict\nhigh-quality solutions for single-machine scheduling problems with a\nnon-decreasing min-sum objective function with or without release times. Our ML\napproach is novel in three major aspects. First, our approach is developed for\nthe entire class of the aforementioned problems. To achieve this, we exploit\nthe fact that the entire class of the problems considered can be formulated as\na time-indexed formulation in a unified manner. We develop a deep neural\nnetwork (DNN) which uses the cost parameters in the time-indexed formulation as\nthe inputs to effectively predict a continuous solution to this formulation,\nbased on which a feasible discrete solution is easily constructed. The second\nnovel aspect of our approach lies in how the DNN model is trained. In view of\nthe NP-hard nature of the problems, labels (i.e., optimal solutions) are hard\nto generate for training. To overcome this difficulty, we generate and utilize\na set of special instances, for which optimal solutions can be found with\nlittle computational effort, to train the ML model offline. 
The third novel\nidea we employ in our approach is that we develop an online single-instance\nlearning approach to fine tune the parameters in the DNN for a given online\ninstance, with the goal of generating an improved solution for the given\ninstance. To this end, we develop a feasibility surrogate that approximates the\nobjective value of a given instance as a continuous function of the outputs of\nthe DNN, which then enables us to derive gradients and update the learnable\nparameters in the DNN. Numerical results show that our approach can efficiently\ngenerate high-quality solutions for a variety of single-machine scheduling\nmin-sum problems with up to 1000 jobs.\n","authors":["Anbang Liu","Zhi-Long Chen","Jinyang Jiang","Xi Chen"],"pdf_url":"https://arxiv.org/pdf/2501.04253v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16950v4","updated":"2025-01-08T03:14:04Z","published":"2024-03-25T17:11:28Z","title":"Aligning with Human Judgement: The Role of Pairwise Large Language Model\n Evaluators in Preference Aggregation","summary":" Large Language Models (LLMs) have demonstrated promising capabilities as\nautomatic evaluators in assessing the quality of generated natural language.\nHowever, LLMs still exhibit biases in evaluation and often struggle to generate\ncoherent evaluations that align with human assessments. In this work, we first\nconduct a systematic study of the misalignment between LLM evaluators and human\njudgement, revealing that existing calibration methods aimed at mitigating\nbiases are insufficient for effectively aligning LLM evaluators. Inspired by\nthe use of preference data in RLHF, we formulate the evaluation as a ranking\nproblem and introduce Pairwise-preference Search (PairS), an uncertainty-guided\nsearch method that employs LLMs to conduct pairwise comparisons and efficiently\nranks candidate texts. 
PairS achieves state-of-the-art performance on\nrepresentative evaluation tasks and demonstrates significant improvements over\ndirect scoring. Furthermore, we provide insights into the role of pairwise\npreference in quantifying the transitivity of LLMs and demonstrate how PairS\nbenefits from calibration.\n","authors":["Yinhong Liu","Han Zhou","Zhijiang Guo","Ehsan Shareghi","Ivan Vulić","Anna Korhonen","Nigel Collier"],"pdf_url":"https://arxiv.org/pdf/2403.16950v4.pdf","comment":"This paper has been accepted by COLM 2024"},{"id":"http://arxiv.org/abs/2412.19403v2","updated":"2025-01-08T02:43:21Z","published":"2024-12-27T01:53:18Z","title":"Fully Data-driven but Interpretable Human Behavioural Modelling with\n Differentiable Discrete Choice Model","summary":" Discrete choice models are essential for modelling various decision-making\nprocesses in human behaviour. However, the specification of these models has\ndepended heavily on domain knowledge from experts, and the fully automated but\ninterpretable modelling of complex human behaviours has been a long-standing\nchallenge. In this paper, we introduce the differentiable discrete choice model\n(Diff-DCM), a fully data-driven method for the interpretable modelling,\nlearning, prediction, and control of complex human behaviours, which is\nrealised by differentiable programming. Solely from input features and choice\noutcomes without any prior knowledge, Diff-DCM can estimate interpretable\nclosed-form utility functions that reproduce observed behaviours. Comprehensive\nexperiments with both synthetic and real-world data demonstrate that Diff-DCM\ncan be applied to various types of data and requires only a small amount of\ncomputational resources for the estimations, which can be completed within tens\nof seconds on a laptop without any accelerators. 
In these experiments, we also\ndemonstrate that, using its differentiability, Diff-DCM can provide useful\ninsights into human behaviours, such as an optimal intervention path for\neffective behavioural changes. This study provides a strong basis for the fully\nautomated and reliable modelling, prediction, and control of human behaviours.\n","authors":["Fumiyasu Makinoshima","Tatsuya Mitomi","Fumiya Makihara","Eigo Segawa"],"pdf_url":"https://arxiv.org/pdf/2412.19403v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.07594v3","updated":"2025-01-08T02:33:37Z","published":"2023-11-10T09:51:24Z","title":"How to Bridge the Gap between Modalities: Survey on Multimodal Large\n Language Model","summary":" We explore Multimodal Large Language Models (MLLMs), which integrate LLMs\nlike GPT-4 to handle multimodal data, including text, images, audio, and more.\nMLLMs demonstrate capabilities such as generating image captions and answering\nimage-based questions, bridging the gap towards real-world human-computer\ninteractions and hinting at a potential pathway to artificial general\nintelligence. However, MLLMs still face challenges in addressing the semantic\ngap in multimodal data, which may lead to erroneous outputs, posing potential\nrisks to society. Selecting the appropriate modality alignment method is\ncrucial, as improper methods might require more parameters without significant\nperformance improvements. This paper aims to explore modality alignment methods\nfor LLMs and their current capabilities. 
Implementing effective modality\nalignment can help LLMs address environmental issues and enhance accessibility.\nThe study surveys existing modality alignment methods for MLLMs, categorizing\nthem into four groups: (1) Multimodal Converter, which transforms data into a\nformat that LLMs can understand; (2) Multimodal Perceiver, which improves how\nLLMs perceive different types of data; (3) Tool Learning, which leverages\nexternal tools to convert data into a common format, usually text; and (4)\nData-Driven Method, which teaches LLMs to understand specific data types within\ndatasets.\n","authors":["Shezheng Song","Xiaopeng Li","Shasha Li","Shan Zhao","Jie Yu","Jun Ma","Xiaoguang Mao","Weimin Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.07594v3.pdf","comment":"Accepted by TKDE"},{"id":"http://arxiv.org/abs/2404.09005v7","updated":"2025-01-08T02:10:31Z","published":"2024-04-13T13:18:40Z","title":"Proof-of-Learning with Incentive Security","summary":" Most concurrent blockchain systems rely heavily on the Proof-of-Work (PoW) or\nProof-of-Stake (PoS) mechanisms for decentralized consensus and security\nassurance. However, the substantial energy expenditure stemming from\ncomputationally intensive yet meaningless tasks has raised considerable\nconcerns surrounding traditional PoW approaches. The PoS mechanism, while free\nof energy consumption, is subject to security and economic issues. Addressing\nthese issues, the paradigm of Proof-of-Useful-Work (PoUW) seeks to employ\nchallenges of practical significance as PoW, thereby imbuing energy consumption\nwith tangible value. While previous efforts in Proof of Learning (PoL) explored\nthe utilization of deep learning model training SGD tasks as PoUW challenges,\nrecent research has revealed its vulnerabilities to adversarial attacks and the\ntheoretical hardness in crafting a byzantine-secure PoL mechanism. 
In this\npaper, we introduce the concept of incentive-security that incentivizes\nrational provers to behave honestly for their best interest, bypassing the\nexisting hardness to design a PoL mechanism with computational efficiency, a\nprovable incentive-security guarantee and controllable difficulty.\nParticularly, our work is secure against two attacks, and also improves the\ncomputational overhead from $\\Theta(1)$ to $O(\\frac{\\log E}{E})$. Furthermore,\nwhile most recent research assumes trusted problem providers and verifiers, our\ndesign also guarantees frontend incentive-security even when problem providers\nare untrusted, and verifier incentive-security that bypasses the Verifier's\nDilemma. By incorporating ML training into blockchain consensus mechanisms with\nprovable guarantees, our research not only proposes an eco-friendly solution to\nblockchain systems, but also provides a proposal for a completely decentralized\ncomputing power market in the new AI age.\n","authors":["Zishuo Zhao","Zhixuan Fang","Xuechao Wang","Xi Chen","Hongxu Su","Haibo Xiao","Yuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.09005v7.pdf","comment":"20 pages, 4 figures"},{"id":"http://arxiv.org/abs/2408.13586v2","updated":"2025-01-08T02:09:15Z","published":"2024-08-24T14:14:32Z","title":"Balancing Diversity and Risk in LLM Sampling: How to Select Your Method\n and Parameter for Open-Ended Text Generation","summary":" Sampling-based decoding strategies have been widely adopted for Large\nLanguage Models (LLMs) in numerous applications, targeting a balance between\ndiversity and quality via temperature tuning and tail truncation. Considering\nthe strong dependency of the candidate next tokens on different prefixes,\nrecent studies propose to adaptively truncate the tail of LLMs' predicted\ndistribution. 
Although improved results have been reported with these methods\non open-ended text generation tasks, the results are highly dependent on the\ncurated parameters and the limited exemplar text. In this paper, we propose a\nsystematic way to estimate the capacity of a truncation sampling method by\nconsidering the trade-off between diversity and risk at each decoding step,\nbased on our collected prefix tree which preserves the context of a full\nsentence. Our work offers a comprehensive comparison of existing truncation\nsampling methods and serves as a practical user guideline for their parameter\nselection.\n","authors":["Yuxuan Zhou","Margret Keuper","Mario Fritz"],"pdf_url":"https://arxiv.org/pdf/2408.13586v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04228v1","updated":"2025-01-08T01:59:47Z","published":"2025-01-08T01:59:47Z","title":"Constraints as Rewards: Reinforcement Learning for Robots without Reward\n Functions","summary":" Reinforcement learning has become an essential algorithm for generating\ncomplex robotic behaviors. However, to learn such behaviors, it is necessary to\ndesign a reward function that describes the task, which often consists of\nmultiple objectives that need to be balanced. This tuning process is known as\nreward engineering and typically involves extensive trial-and-error. In this\npaper, to avoid this trial-and-error process, we propose the concept of\nConstraints as Rewards (CaR). CaR formulates the task objective using multiple\nconstraint functions instead of a reward function and solves a reinforcement\nlearning problem with constraints using the Lagrangian method. By adopting this\napproach, different objectives are automatically balanced, because Lagrange\nmultipliers serve as the weights among the objectives. In addition, we will\ndemonstrate that constraints, expressed as inequalities, provide an intuitive\ninterpretation of the optimization target designed for the task. 
We apply the\nproposed method to the standing-up motion generation task of a\nsix-wheeled-telescopic-legged robot and demonstrate that the proposed method\nsuccessfully acquires the target behavior, even though it is challenging to\nlearn with manually designed reward functions.\n","authors":["Yu Ishihara","Noriaki Takasugi","Kotaro Kawakami","Masaya Kinoshita","Kazumi Aoyama"],"pdf_url":"https://arxiv.org/pdf/2501.04228v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04227v1","updated":"2025-01-08T01:58:42Z","published":"2025-01-08T01:58:42Z","title":"Agent Laboratory: Using LLM Agents as Research Assistants","summary":" Historically, scientific discovery has been a lengthy and costly process,\ndemanding substantial time and resources from initial conception to final\nresults. To accelerate scientific discovery, reduce research costs, and improve\nresearch quality, we introduce Agent Laboratory, an autonomous LLM-based\nframework capable of completing the entire research process. This framework\naccepts a human-provided research idea and progresses through three\nstages--literature review, experimentation, and report writing--to produce\ncomprehensive research outputs, including a code repository and a research\nreport, while enabling users to provide feedback and guidance at each stage. We\ndeploy Agent Laboratory with various state-of-the-art LLMs and invite multiple\nresearchers to assess its quality by participating in a survey, providing human\nfeedback to guide the research process, and then evaluating the final paper. 
We\nfound that: (1) Agent Laboratory driven by o1-preview generates the best\nresearch outcomes; (2) The generated machine learning code is able to achieve\nstate-of-the-art performance compared to existing methods; (3) Human\ninvolvement, providing feedback at each stage, significantly improves the\noverall quality of research; (4) Agent Laboratory significantly reduces\nresearch expenses, achieving an 84% decrease compared to previous autonomous\nresearch methods. We hope Agent Laboratory enables researchers to allocate more\neffort toward creative ideation rather than low-level coding and writing,\nultimately accelerating scientific discovery.\n","authors":["Samuel Schmidgall","Yusheng Su","Ze Wang","Ximeng Sun","Jialian Wu","Xiaodong Yu","Jiang Liu","Zicheng Liu","Emad Barsoum"],"pdf_url":"https://arxiv.org/pdf/2501.04227v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.16050v3","updated":"2025-01-08T01:47:16Z","published":"2024-12-20T16:52:11Z","title":"Label-Efficient Data Augmentation with Video Diffusion Models for\n Guidewire Segmentation in Cardiac Fluoroscopy","summary":" The accurate segmentation of guidewires in interventional cardiac fluoroscopy\nvideos is crucial for computer-aided navigation tasks. Although deep learning\nmethods have demonstrated high accuracy and robustness in wire segmentation,\nthey require substantial annotated datasets for generalizability, underscoring\nthe need for extensive labeled data to enhance model performance. To address\nthis challenge, we propose the Segmentation-guided Frame-consistency Video\nDiffusion Model (SF-VD) to generate large collections of labeled fluoroscopy\nvideos, augmenting the training data for wire segmentation networks. SF-VD\nleverages videos with limited annotations by independently modeling scene\ndistribution and motion distribution. 
It first samples the scene distribution\nby generating 2D fluoroscopy images with wires positioned according to a\nspecified input mask, and then samples the motion distribution by progressively\ngenerating subsequent frames, ensuring frame-to-frame coherence through a\nframe-consistency strategy. A segmentation-guided mechanism further refines the\nprocess by adjusting wire contrast, ensuring a diverse range of visibility in\nthe synthesized image. Evaluation on a fluoroscopy dataset confirms the\nsuperior quality of the generated videos and shows significant improvements in\nguidewire segmentation.\n","authors":["Shaoyan Pan","Yikang Liu","Lin Zhao","Eric Z. Chen","Xiao Chen","Terrence Chen","Shanhui Sun"],"pdf_url":"https://arxiv.org/pdf/2412.16050v3.pdf","comment":"AAAI 2025"},{"id":"http://arxiv.org/abs/2411.09852v2","updated":"2025-01-08T01:44:07Z","published":"2024-11-15T00:20:36Z","title":"InterFormer: Towards Effective Heterogeneous Interaction Learning for\n Click-Through Rate Prediction","summary":" Click-through rate (CTR) prediction, which predicts the probability of a user\nclicking an ad, is a fundamental task in recommender systems. The emergence of\nheterogeneous information, such as user profile and behavior sequences, depicts\nuser interests from different aspects. A mutually beneficial integration of\nheterogeneous information is the cornerstone towards the success of CTR\nprediction. However, most of the existing methods suffer from two fundamental\nlimitations, including (1) insufficient inter-mode interaction due to the\nunidirectional information flow between modes, and (2) aggressive information\naggregation caused by early summarization, resulting in excessive information\nloss. To address the above limitations, we propose a novel module named\nInterFormer to learn heterogeneous information interaction in an interleaving\nstyle. 
To achieve better interaction learning, InterFormer enables\nbidirectional information flow for mutually beneficial learning across\ndifferent modes. To avoid aggressive information aggregation, we retain\ncomplete information in each data mode and use a separate bridging arch for\neffective information selection and summarization. Our proposed InterFormer\nachieves state-of-the-art performance on three public datasets and a\nlarge-scale industrial dataset.\n","authors":["Zhichen Zeng","Xiaolong Liu","Mengyue Hang","Xiaoyi Liu","Qinghai Zhou","Chaofei Yang","Yiqun Liu","Yichen Ruan","Laming Chen","Yuxin Chen","Yujia Hao","Jiaqi Xu","Jade Nie","Xi Liu","Buyun Zhang","Wei Wen","Siyang Yuan","Kai Wang","Wen-Yen Chen","Yiping Han","Huayu Li","Chunzhi Yang","Bo Long","Philip S. Yu","Hanghang Tong","Jiyan Yang"],"pdf_url":"https://arxiv.org/pdf/2411.09852v2.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2501.04217v1","updated":"2025-01-08T01:27:35Z","published":"2025-01-08T01:27:35Z","title":"Continual Self-supervised Learning Considering Medical Domain Knowledge\n in Chest CT Images","summary":" We propose a novel continual self-supervised learning method (CSSL)\nconsidering medical domain knowledge in chest CT images. Our approach addresses\nthe challenge of sequential learning by effectively capturing the relationship\nbetween previously learned knowledge and new information at different stages.\nBy incorporating an enhanced DER into CSSL and maintaining both diversity and\nrepresentativeness within the rehearsal buffer of DER, the risk of data\ninterference during pretraining is reduced, enabling the model to learn\nricher and more robust feature representations. In addition, we incorporate a mixup\nstrategy and feature distillation to further enhance the model's ability to\nlearn meaningful representations. 
We validate our method using chest CT images\nobtained under two different imaging conditions, demonstrating superior\nperformance compared to state-of-the-art methods.\n","authors":["Ren Tasai","Guang Li","Ren Togo","Minghui Tang","Takaaki Yoshimura","Hiroyuki Sugimori","Kenji Hirata","Takahiro Ogawa","Kohsuke Kudo","Miki Haseyama"],"pdf_url":"https://arxiv.org/pdf/2501.04217v1.pdf","comment":"Accepted by ICASSP 2025"},{"id":"http://arxiv.org/abs/2501.02683v2","updated":"2025-01-08T01:27:30Z","published":"2025-01-05T23:19:55Z","title":"From Superficial Patterns to Semantic Understanding: Fine-Tuning\n Language Models on Contrast Sets","summary":" Large-scale pre-trained language models have demonstrated high performance on\nstandard datasets for natural language inference (NLI) tasks. Unfortunately,\nthese evaluations can be misleading, as although the models can perform well on\nin-distribution data, they perform poorly on out-of-distribution test sets,\nsuch as contrast sets. Contrast sets consist of perturbed instances of data\nthat have very minor, but meaningful, changes to the input that alter the gold\nlabel, revealing how models can learn superficial patterns in the training data\nrather than learning more sophisticated language nuances. As an example, the\nELECTRA-small language model achieves nearly 90% accuracy on an SNLI dataset\nbut drops to 75% when tested on an out-of-distribution contrast set. The\nresearch carried out in this study explores how the robustness of a language\nmodel can be improved by exposing it to small amounts of more complex contrast\nsets during training to help it better learn language patterns. 
With this\napproach, the model recovers performance and achieves nearly 90% accuracy on\ncontrast sets, highlighting the importance of diverse and challenging training\ndata.\n","authors":["Daniel Petrov"],"pdf_url":"https://arxiv.org/pdf/2501.02683v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04213v1","updated":"2025-01-08T01:18:14Z","published":"2025-01-08T01:18:14Z","title":"UPAQ: A Framework for Real-Time and Energy-Efficient 3D Object Detection\n in Autonomous Vehicles","summary":" To enhance perception in autonomous vehicles (AVs), recent efforts are\nconcentrating on 3D object detectors, which deliver more comprehensive\npredictions than traditional 2D object detectors, at the cost of increased\nmemory footprint and computational resource usage. We present a novel framework\ncalled UPAQ, which leverages semi-structured pattern pruning and quantization\nto improve the efficiency of LiDAR point-cloud and camera-based 3D object\ndetectors on resource-constrained embedded AV platforms. 
Experimental results\non the Jetson Orin Nano embedded platform indicate that UPAQ achieves up to\n5.62x and 5.13x model compression rates, up to 1.97x and 1.86x boost in\ninference speed, and up to 2.07x and 1.87x reduction in energy consumption\ncompared to state-of-the-art model compression frameworks, on the Pointpillar\nand SMOKE models respectively.\n","authors":["Abhishek Balasubramaniam","Febin P Sunny","Sudeep Pasricha"],"pdf_url":"https://arxiv.org/pdf/2501.04213v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04211v1","updated":"2025-01-08T01:11:17Z","published":"2025-01-08T01:11:17Z","title":"CURing Large Models: Compression via CUR Decomposition","summary":" Large deep learning models have achieved remarkable success but are\nresource-intensive, posing challenges in computational cost and memory usage.\n We introduce CURing, a novel model compression method based on CUR matrix\ndecomposition, which approximates weight matrices as the product of selected\ncolumns (C) and rows (R), and a small linking matrix (U). We apply this\ndecomposition to weights chosen based on the combined influence of their\nmagnitudes and activations. 
By identifying and retaining informative rows and\ncolumns, CURing significantly reduces model size with minimal performance loss.\n It preserves the original network's input/output structures, retains\nimportant features such as non-negativity, and the compressed model's\nactivation patterns align with the original, thereby enhancing\ninterpretability.\n","authors":["Sanghyeon Park","Soo-Mook Moon"],"pdf_url":"https://arxiv.org/pdf/2501.04211v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04202v1","updated":"2025-01-08T00:43:31Z","published":"2025-01-08T00:43:31Z","title":"Generative Dataset Distillation Based on Self-knowledge Distillation","summary":" Dataset distillation is an effective technique for reducing the cost and\ncomplexity of model training while maintaining performance by compressing large\ndatasets into smaller, more efficient versions. In this paper, we present a\nnovel generative dataset distillation method that can improve the accuracy of\naligning prediction logits. Our approach integrates self-knowledge distillation\nto achieve more precise distribution matching between the synthetic and\noriginal data, thereby capturing the overall structure and relationships within\nthe data. To further improve the accuracy of alignment, we introduce a\nstandardization step on the logits before performing distribution matching,\nensuring consistency in the range of logits. 
Through extensive experiments, we\ndemonstrate that our method outperforms existing state-of-the-art methods,\nresulting in superior distillation performance.\n","authors":["Longzhen Li","Guang Li","Ren Togo","Keisuke Maeda","Takahiro Ogawa","Miki Haseyama"],"pdf_url":"https://arxiv.org/pdf/2501.04202v1.pdf","comment":"Accepted by ICASSP 2025"},{"id":"http://arxiv.org/abs/2501.04193v1","updated":"2025-01-08T00:06:38Z","published":"2025-01-08T00:06:38Z","title":"GNN-based Decentralized Perception in Multirobot Systems for Predicting\n Worker Actions","summary":" In industrial environments, predicting human actions is essential for\nensuring safe and effective collaboration between humans and robots. This paper\nintroduces a perception framework that enables mobile robots to understand and\nshare information about human actions in a decentralized way. The framework\nfirst allows each robot to build a spatial graph representing its surroundings,\nwhich it then shares with other robots. This shared spatial data is combined\nwith temporal information to track human behavior over time. A swarm-inspired\ndecision-making process is used to ensure all robots agree on a unified\ninterpretation of the human's actions. Results show that adding more robots and\nincorporating longer time sequences improve prediction accuracy. 
Additionally,\nthe consensus mechanism increases system resilience, making the multi-robot\nsetup more reliable in dynamic industrial settings.\n","authors":["Ali Imran","Giovanni Beltrame","David St-Onge"],"pdf_url":"https://arxiv.org/pdf/2501.04193v1.pdf","comment":"Submitted to RA-L"},{"id":"http://arxiv.org/abs/2402.17853v2","updated":"2025-01-08T00:00:44Z","published":"2024-02-27T19:36:27Z","title":"Latent Neural PDE Solver: a reduced-order modelling framework for\n partial differential equations","summary":" Neural networks have shown promising potential in accelerating the numerical\nsimulation of systems governed by partial differential equations (PDEs).\nDifferent from many existing neural network surrogates operating on\nhigh-dimensional discretized fields, we propose to learn the dynamics of the\nsystem in the latent space with much coarser discretizations. In our proposed\nframework - Latent Neural PDE Solver (LNS), a non-linear autoencoder is first\ntrained to project the full-order representation of the system onto the\nmesh-reduced space, then a temporal model is trained to predict the future\nstate in this mesh-reduced space. This reduction process simplifies the\ntraining of the temporal model by greatly reducing the computational cost\naccompanying a fine discretization. We study the capability of the proposed\nframework and several other popular neural PDE solvers on various types of\nsystems including single-phase and multi-phase flows along with varying system\nparameters. We showcase that it has competitive accuracy and efficiency\ncompared to the neural PDE solver that operates on full-order space.\n","authors":["Zijie Li","Saurabh Patil","Francis Ogoke","Dule Shu","Wilson Zhen","Michael Schneier","John R. 
Buchanan, Jr.","Amir Barati Farimani"],"pdf_url":"https://arxiv.org/pdf/2402.17853v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.05910v3","updated":"2025-01-08T23:40:38Z","published":"2024-07-08T13:15:11Z","title":"Enhancing Vision-Language Models with Scene Graphs for Traffic Accident\n Understanding","summary":" Recognizing a traffic accident is an essential part of any autonomous driving\nor road monitoring system. An accident can appear in a wide variety of forms,\nand understanding what type of accident is taking place may be useful to\nprevent it from recurring. This work focuses on classifying traffic scenes into\nspecific accident types. We approach the problem by representing a traffic\nscene as a graph, where objects such as cars can be represented as nodes, and\nrelative distances and directions between them as edges. This representation of\na traffic scene is referred to as a scene graph, and can be used as input for\nan accident classifier. Better results are obtained with a classifier that\nfuses the scene graph input with visual and textual representations. This work\nintroduces a multi-stage, multimodal pipeline that pre-processes videos of\ntraffic accidents, encodes them as scene graphs, and aligns this representation\nwith vision and language modalities before executing the classification task.\nWhen trained on 4 classes, our method achieves a balanced accuracy score of\n57.77% on an (unbalanced) subset of the popular Detection of Traffic Anomaly\n(DoTA) benchmark, representing an increase of close to 5 percentage points from\nthe case where scene graph information is not taken into account.\n","authors":["Aaron Lohner","Francesco Compagno","Jonathan Francis","Alessandro Oltramari"],"pdf_url":"https://arxiv.org/pdf/2407.05910v3.pdf","comment":"Won the 'Best Paper Runner-up Award' at the 2024 IEEE International\n Automated Vehicle Validation Conference (IAVVC 2024). 
Also accepted at the\n 1st Workshop on Semantic Reasoning and Goal Understanding in Robotics, at the\n Robotics Science and Systems Conference (RSS SemRob 2024)"},{"id":"http://arxiv.org/abs/2501.04882v1","updated":"2025-01-08T23:38:19Z","published":"2025-01-08T23:38:19Z","title":"Reach Measurement, Optimization and Frequency Capping In Targeted Online\n Advertising Under k-Anonymity","summary":" The growth in the use of online advertising to foster brand awareness over\nrecent years is largely attributable to the ubiquity of social media. One\npivotal technology contributing to the success of online brand advertising is\nfrequency capping, a mechanism that enables marketers to control the number of\ntimes an ad is shown to a specific user. However, the very foundation of this\ntechnology is being scrutinized as the industry gravitates towards advertising\nsolutions that prioritize user privacy. This paper delves into the issue of\nreach measurement and optimization within the context of $k$-anonymity, a\nprivacy-preserving model gaining traction across major online advertising\nplatforms. We outline how to report reach within this new privacy landscape and\ndemonstrate how probabilistic discounting, a probabilistic adaptation of\ntraditional frequency capping, can be employed to optimize campaign\nperformance. Experiments are performed to assess the trade-off between user\nprivacy and the efficacy of online brand advertising. 
Notably, we discern a\nsignificant dip in performance once privacy is introduced, yet this comes\nwith a limited additional cost for advertising platforms to offer their users\nmore privacy.\n","authors":["Yuan Gao","Mu Qiao"],"pdf_url":"https://arxiv.org/pdf/2501.04882v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04877v1","updated":"2025-01-08T23:21:43Z","published":"2025-01-08T23:21:43Z","title":"Real-Time Textless Dialogue Generation","summary":" Recent advancements in large language models (LLMs) have led to significant\nprogress in text-based dialogue systems. These systems can now generate\nhigh-quality responses that are accurate and coherent across a wide range of\ntopics and tasks. However, spoken dialogue systems still lag behind in terms of\nnaturalness. They tend to produce robotic interactions, with issues such as\nslow response times, overly generic or cautious replies, and a lack of natural\nrhythm and fluid turn-taking. This shortcoming is largely due to the\nover-reliance on the traditional cascaded design, which involves separate,\nsequential components, as well as the use of text as an intermediate\nrepresentation. This paper proposes a real-time, textless spoken dialogue\ngeneration model (RTTL-DG) that aims to overcome these challenges. Our system\nenables fluid turn-taking and generates responses with minimal delay by\nprocessing streaming spoken conversation directly. Additionally, our model\nincorporates backchannels, fillers, laughter, and other paralinguistic signals,\nwhich are often absent in cascaded dialogue systems, to create more natural and\nhuman-like interactions. 
The implementations and generated samples are\navailable in our repository: https://github.com/mailong25/rts2s-dg\n","authors":["Long Mai","Julie Carson-Berndsen"],"pdf_url":"https://arxiv.org/pdf/2501.04877v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00958v2","updated":"2025-01-08T23:16:20Z","published":"2024-05-02T02:50:58Z","title":"Generative manufacturing systems using diffusion models and ChatGPT","summary":" In this study, we introduce Generative Manufacturing Systems (GMS) as a novel\napproach to effectively manage and coordinate autonomous manufacturing assets,\nthereby enhancing their responsiveness and flexibility to address a wide array\nof production objectives and human preferences. Deviating from traditional\nexplicit modeling, GMS employs generative AI, including diffusion models and\nChatGPT, for implicit learning from envisioned futures, marking a shift from a\nmodel-optimum to a training-sampling decision-making. Through the integration\nof generative AI, GMS enables complex decision-making through interactive\ndialogue with humans, allowing manufacturing assets to generate multiple\nhigh-quality global decisions that can be iteratively refined based on human\nfeedback. Empirical findings showcase GMS's substantial improvement in system\nresilience and responsiveness to uncertainties, with decision times reduced\nfrom seconds to milliseconds. The study underscores the inherent creativity and\ndiversity in the generated solutions, facilitating human-centric\ndecision-making through seamless and continuous human-machine interactions.\n","authors":["Xingyu Li","Fei Tao","Wei Ye","Aydin Nassehi","John W. Sutherland"],"pdf_url":"https://arxiv.org/pdf/2405.00958v2.pdf","comment":"We are withdrawing this preprint to incorporate significant new\n results and expand the scope of the paper. 
We plan to resubmit a\n substantially revised version in the near future"},{"id":"http://arxiv.org/abs/2501.04873v1","updated":"2025-01-08T23:07:10Z","published":"2025-01-08T23:07:10Z","title":"Back Home: A Machine Learning Approach to Seashell Classification and\n Ecosystem Restoration","summary":" In Costa Rica, an average of 5 tons of seashells are extracted from\necosystems annually. Confiscated seashells cannot be returned to their\necosystems due to the lack of origin recognition. To address this issue, we\ndeveloped a convolutional neural network (CNN) specifically for seashell\nidentification. We built a dataset from scratch, consisting of approximately\n19000 images from the Pacific and Caribbean coasts. Using this dataset, the\nmodel achieved a classification accuracy exceeding 85%. The model has been\nintegrated into a user-friendly application, which has classified over 36,000\nseashells to date, delivering real-time results within 3 seconds per image. To\nfurther enhance the system's accuracy, an anomaly detection mechanism was\nincorporated to filter out irrelevant or anomalous inputs, ensuring only valid\nseashell images are processed.\n","authors":["Alexander Valverde","Luis Solano"],"pdf_url":"https://arxiv.org/pdf/2501.04873v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14469v6","updated":"2025-01-08T21:47:16Z","published":"2024-06-20T16:32:18Z","title":"Forecasting Symmetric Random Walks: A Fusion Approach","summary":" Forecasting random walks is notoriously challenging, with na\\"ive prediction\nserving as a difficult-to-surpass baseline. To investigate the potential of\nusing movement predictions to improve point forecasts in this context, this\nstudy focuses on symmetric random walks, in which the target variable's future\nvalue is reformulated as a combination of its future movement and current\nvalue. 
The proposed forecasting method, termed the fusion of movement and\nna\\\"ive predictions (FMNP), is grounded in this reformulation. The simulation\nresults show that FMNP achieves statistically significant improvements over\nna\\\"ive prediction, even when the movement prediction accuracy is only slightly\nabove 0.50. In practice, movement predictions can be derived from the\ncomovement between an exogenous variable and the target variable and then\nlinearly combined with the na\\\"ive prediction to generate the final forecast.\nFMNP effectiveness was evaluated on four U.S. financial time series -- the\nclose prices of Boeing (BA), Brent crude oil (OIL), Halliburton (HAL), and\nSchlumberger (SLB) -- using the open price of the Financial Times Stock\nExchange (FTSE) index as the exogenous variable. In all the cases, FMNP\noutperformed the na\\\"ive prediction, demonstrating its efficacy in forecasting\nsymmetric random walks and its potential applicability to other forecasting\ntasks.\n","authors":["Cheng Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.14469v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04848v1","updated":"2025-01-08T21:22:45Z","published":"2025-01-08T21:22:45Z","title":"Exploring Large Language Models for Semantic Analysis and Categorization\n of Android Malware","summary":" Malware analysis is a complex process of examining and evaluating malicious\nsoftware's functionality, origin, and potential impact. This arduous process\ntypically involves dissecting the software to understand its components,\ninfection vector, propagation mechanism, and payload. Over the years, deep\nreverse engineering of malware has become increasingly tedious, mainly due to\nmodern malicious codebases' fast evolution and sophistication. 
Essentially,\nanalysts are tasked with identifying the elusive needle in the haystack within\nthe complexities of zero-day malware, all while under tight time constraints.\nThus, in this paper, we explore leveraging Large Language Models (LLMs) for\nsemantic malware analysis to expedite the analysis of known and novel samples.\nBuilt on GPT-4o-mini model, \\msp is designed to augment malware analysis for\nAndroid through a hierarchical-tiered summarization chain and strategic prompt\nengineering. Additionally, \\msp performs malware categorization, distinguishing\npotential malware from benign applications, thereby saving time during the\nmalware reverse engineering process. Despite not being fine-tuned for Android\nmalware analysis, we demonstrate that through optimized and advanced prompt\nengineering \\msp can achieve up to 77% classification accuracy while providing\nhighly robust summaries at functional, class, and package levels. In addition,\nleveraging the backward tracing of the summaries from package to function\nlevels allowed us to pinpoint the precise code snippets responsible for\nmalicious behavior.\n","authors":["Brandon J Walton","Mst Eshita Khatun","James M Ghawaly","Aisha Ali-Gombe"],"pdf_url":"https://arxiv.org/pdf/2501.04848v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04844v1","updated":"2025-01-08T21:11:35Z","published":"2025-01-08T21:11:35Z","title":"Enhancing Listened Speech Decoding from EEG via Parallel Phoneme\n Sequence Prediction","summary":" Brain-computer interfaces (BCI) offer numerous human-centered application\npossibilities, particularly affecting people with neurological disorders. Text\nor speech decoding from brain activities is a relevant domain that could\naugment the quality of life for people with impaired speech perception. 
We\npropose a novel approach to enhance listened speech decoding from\nelectroencephalography (EEG) signals by utilizing an auxiliary phoneme\npredictor that simultaneously decodes textual phoneme sequences. The proposed\nmodel architecture consists of three main parts: EEG module, speech module, and\nphoneme predictor. The EEG module learns to properly represent EEG signals into\nEEG embeddings. The speech module generates speech waveforms from the EEG\nembeddings. The phoneme predictor outputs the decoded phoneme sequences in text\nmodality. Our proposed approach allows users to obtain decoded listened speech\nfrom EEG signals in both modalities (speech waveforms and textual phoneme\nsequences) simultaneously, eliminating the need for a concatenated sequential\npipeline for each modality. The proposed approach also outperforms previous\nmethods in both modalities. The source code and speech samples are publicly\navailable.\n","authors":["Jihwan Lee","Tiantian Feng","Aditya Kommineni","Sudarsana Reddy Kadiri","Shrikanth Narayanan"],"pdf_url":"https://arxiv.org/pdf/2501.04844v1.pdf","comment":"ICASSP 2025"},{"id":"http://arxiv.org/abs/2408.06653v3","updated":"2025-01-08T20:40:09Z","published":"2024-08-13T05:53:46Z","title":"Hierarchical Structured Neural Network: Efficient Retrieval Scaling for\n Large Scale Recommendation","summary":" Retrieval, the initial stage of a recommendation system, is tasked with\ndown-selecting items from a pool of tens of millions of candidates to a few\nthousands. Embedding Based Retrieval (EBR) has been a typical choice for this\nproblem, addressing the computational demands of deep neural networks across\nvast item corpora. EBR utilizes Two Tower or Siamese Networks to learn\nrepresentations for users and items, and employ Approximate Nearest Neighbor\n(ANN) search to efficiently retrieve relevant items. Despite its popularity in\nindustry, EBR faces limitations. 
The Two Tower architecture, relying on a\nsingle dot product interaction, struggles to capture complex data distributions\ndue to limited capability in learning expressive interactions between users and\nitems. Additionally, ANN index building and representation learning for user\nand item are often separate, leading to inconsistencies exacerbated by\nrepresentation drift (e.g. continuous online training) and item drift (e.g. items\nexpired and new items added). In this paper, we introduce the Hierarchical\nStructured Neural Network (HSNN), an efficient deep neural network model to\nlearn intricate user and item interactions beyond the commonly used dot product\nin retrieval tasks, achieving sublinear computational costs relative to corpus\nsize. A Modular Neural Network (MoNN) is designed to maintain high\nexpressiveness for interaction learning while ensuring efficiency. A mixture of\nMoNNs operate on a hierarchical item index to achieve extensive computation\nsharing, enabling it to scale up to large corpus size. MoNN and the\nhierarchical index are jointly learnt to continuously adapt to distribution\nshifts in both user interests and item distributions. HSNN achieves substantial\nimprovement in offline evaluation compared to prevailing methods.\n","authors":["Kaushik Rangadurai","Siyang Yuan","Minhui Huang","Yiqun Liu","Golnaz Ghasemiesfeh","Yunchen Pu","Haiyu Lu","Xingfeng He","Fangzhou Xu","Andrew Cui","Vidhoon Viswanathan","Lin Yang","Liang Wang","Jiyan Yang","Chonglin Sun"],"pdf_url":"https://arxiv.org/pdf/2408.06653v3.pdf","comment":"Resubmit"},{"id":"http://arxiv.org/abs/2501.04835v1","updated":"2025-01-08T20:38:02Z","published":"2025-01-08T20:38:02Z","title":"Do Code LLMs Understand Design Patterns?","summary":" Code Large Language Models (LLMs) demonstrate great versatility in adapting\nto various downstream tasks, including code generation and completion, as well\nas bug detection and fixing. 
However, Code LLMs often fail to capture existing\ncoding standards, leading to the generation of code that conflicts with the\nrequired design patterns for a given project. As a result, developers must\npost-process to adapt the generated code to the project's design norms. In this\nwork, we empirically investigate the biases of Code LLMs in software\ndevelopment. Through carefully designed experiments, we assess the models'\nunderstanding of design patterns across recognition, comprehension, and\ngeneration. Our findings reveal that biases in Code LLMs significantly affect\nthe reliability of downstream tasks.\n","authors":["Zhenyu Pan","Xuefeng Song","Yunkun Wang","Rongyu Cao","Binhua Li","Yongbin Li","Han Liu"],"pdf_url":"https://arxiv.org/pdf/2501.04835v1.pdf","comment":"accepted by llm4code workshop in ICSE 2025"},{"id":"http://arxiv.org/abs/2501.04832v1","updated":"2025-01-08T20:38:02Z","published":"2025-01-08T20:38:02Z","title":"ActPC-Geom: Towards Scalable Online Neural-Symbolic Learning via\n Accelerating Active Predictive Coding with Information Geometry & Diverse\n Cognitive Mechanisms","summary":" This paper introduces ActPC-Geom, an approach to accelerate Active Predictive\nCoding (ActPC) in neural networks by integrating information geometry,\nspecifically using Wasserstein-metric-based methods for measure-dependent\ngradient flows. 
We propose replacing KL-divergence in ActPC's predictive error\nassessment with the Wasserstein metric, suggesting this may enhance network\nrobustness.\n To make this computationally feasible, we present strategies including: (1)\nneural approximators for inverse measure-dependent Laplacians, (2) approximate\nkernel PCA embeddings for low-rank approximations feeding into these\napproximators, and (3) compositional hypervector embeddings derived from kPCA\noutputs, with algebra optimized for fuzzy FCA lattices learned through neural\narchitectures analyzing network states.\n This results in an ActPC architecture capable of real-time online learning\nand integrating continuous (e.g., transformer-like or Hopfield-net-like) and\ndiscrete symbolic ActPC networks, including frameworks like OpenCog Hyperon or\nActPC-Chem for algorithmic chemistry evolution. Shared probabilistic,\nconcept-lattice, and hypervector models enable symbolic-subsymbolic\nintegration.\n Key features include (1) compositional reasoning via hypervector embeddings\nin transformer-like architectures for tasks like commonsense reasoning, and (2)\nHopfield-net dynamics enabling associative long-term memory and\nattractor-driven cognitive features.\n We outline how ActPC-Geom combines few-shot learning with online weight\nupdates, enabling deliberative thinking and seamless symbolic-subsymbolic\nreasoning. Ideas from Galois connections are explored for efficient hybrid\nActPC/ActPC-Chem processing. 
Finally, we propose a specialized HPC design\noptimized for real-time focused attention and deliberative reasoning tailored\nto ActPC-Geom's demands.\n","authors":["Ben Goertzel"],"pdf_url":"https://arxiv.org/pdf/2501.04832v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15237v3","updated":"2025-01-08T20:34:02Z","published":"2024-08-27T17:56:11Z","title":"The Mamba in the Llama: Distilling and Accelerating Hybrid Models","summary":" Linear RNN architectures, like Mamba, can be competitive with Transformer\nmodels in language modeling while having advantageous deployment\ncharacteristics. Given the focus on training large-scale Transformer models, we\nconsider the challenge of converting these pretrained models for deployment. We\ndemonstrate that it is feasible to distill large Transformers into linear RNNs\nby reusing the linear projection weights from attention layers with academic\nGPU resources. The resulting hybrid model, which incorporates a quarter of the\nattention layers, achieves performance comparable to the original Transformer\nin chat benchmarks and outperforms open-source hybrid Mamba models trained from\nscratch with trillions of tokens in both chat benchmarks and general\nbenchmarks. Moreover, we introduce a hardware-aware speculative decoding\nalgorithm that accelerates the inference speed of Mamba and hybrid models.\nOverall we show how, with limited computation resources, we can remove many of\nthe original attention layers and generate from the resulting model more\nefficiently. Our top-performing model, distilled from Llama3-8B-Instruct,\nachieves a 29.61 length-controlled win rate on AlpacaEval 2 against GPT-4 and\n7.35 on MT-Bench, surpassing the best 8B scale instruction-tuned linear RNN\nmodel. We also find that the distilled model has natural length extrapolation,\nshowing almost perfect accuracy in the needle-in-a-haystack test at 20x the\ndistillation length. 
Code and pre-trained checkpoints are open-sourced at\nhttps://github.com/jxiw/MambaInLlama and\nhttps://github.com/itsdaniele/speculative_mamba.\n","authors":["Junxiong Wang","Daniele Paliotta","Avner May","Alexander M. Rush","Tri Dao"],"pdf_url":"https://arxiv.org/pdf/2408.15237v3.pdf","comment":"NeurIPS 2024. v3 updates: fix format errors"},{"id":"http://arxiv.org/abs/2501.04826v1","updated":"2025-01-08T20:26:13Z","published":"2025-01-08T20:26:13Z","title":"Intelligent Gradient Boosting Algorithms for Estimating Strength of\n Modified Subgrade Soil","summary":" The performance of pavement under loading depends on the strength of the\nsubgrade. However, experimental estimation of properties of pavement strengths\nsuch as California bearing ratio (CBR), unconfined compressive strength (UCS)\nand resistance value (R) are often tedious, time-consuming and costly, thereby\ninspiring a growing interest in machine learning based tools which are simple,\ncheap and fast alternatives. Thus, the potential application of two boosting\ntechniques; categorical boosting (CatBoost) and extreme gradient boosting\n(XGBoost) and support vector regression (SVR), is similarly explored in this\nstudy for estimation of properties of subgrade soil modified with hydrated lime\nactivated rice husk ash (HARSH). Using 121 experimental data samples of varying\nproportions of HARSH, plastic limit, liquid limit, plasticity index, clay\nactivity, optimum moisture content, and maximum dry density as input for CBR,\nUCS and R estimation, four evaluation metrics namely coefficient of\ndetermination (R2), root mean squared error (RMSE), mean absolute error (MAE)\nand mean absolute percentage error (MAPE) are used to evaluate the models'\nperformance. The results indicate that XGBoost outperformed CatBoost and SVR in\nestimating these properties, yielding R2 of 0.9994, 0.9995 and 0.9999 in\nestimating the CBR, UCS and R respectively. 
Also, SVR outperformed CatBoost in\nestimating the CBR and R, each with an R2 of 0.9997. On the other hand,\nCatBoost outperformed SVR in estimating the UCS with R2 of 0.9994. Feature\nsensitivity analysis shows that the three machine learning techniques are\nunanimous that increasing the HARSH proportion leads to increased values of the estimated\nproperties. A comparison with previous results also shows the\nsuperiority of XGBoost in estimating subgrade properties.\n","authors":["Ismail B. Mustapha","Muyideen Abdulkareem","Shafaatunnur Hasan","Abideen Ganiyu","Hatem Nabus","Jin Chai Lee"],"pdf_url":"https://arxiv.org/pdf/2501.04826v1.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2501.04819v1","updated":"2025-01-08T20:17:18Z","published":"2025-01-08T20:17:18Z","title":"Planing It by Ear: Convolutional Neural Networks for Acoustic Anomaly\n Detection in Industrial Wood Planers","summary":" In recent years, the wood product industry has been facing a skilled labor\nshortage. The result is more frequent sudden failures, resulting in additional\ncosts for these companies already operating in a very competitive market.\nMoreover, sawmills are challenging environments for machinery and sensors.\nGiven that experienced machine operators may be able to diagnose defects or\nmalfunctions, one possible way of assisting novice operators is through\nacoustic monitoring. As a step towards the automation of wood-processing\nequipment and decision support systems for machine operators, in this paper, we\nexplore using a deep convolutional autoencoder for acoustic anomaly detection\nof wood planers on a new real-life dataset. 
Specifically, our convolutional\nautoencoder with skip connections (Skip-CAE) and our Skip-CAE transformer\noutperform the DCASE autoencoder baseline, one-class SVM, isolation forest and\na published convolutional autoencoder architecture, respectively obtaining an\narea under the ROC curve of 0.846 and 0.875 on a dataset of real-factory planer\nsounds. Moreover, we show that adding skip connections and attention mechanism\nunder the form of a transformer encoder-decoder helps to further improve the\nanomaly detection capabilities.\n","authors":["Anthony Deschênes","Rémi Georges","Cem Subakan","Bruna Ugulino","Antoine Henry","Michael Morin"],"pdf_url":"https://arxiv.org/pdf/2501.04819v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04817v1","updated":"2025-01-08T20:14:07Z","published":"2025-01-08T20:14:07Z","title":"Decentralised Resource Sharing in TinyML: Wireless Bilayer Gossip\n Parallel SGD for Collaborative Learning","summary":" With the growing computational capabilities of microcontroller units (MCUs),\nedge devices can now support machine learning models. However, deploying\ndecentralised federated learning (DFL) on such devices presents key challenges,\nincluding intermittent connectivity, limited communication range, and dynamic\nnetwork topologies. This paper proposes a novel framework, bilayer Gossip\nDecentralised Parallel Stochastic Gradient Descent (GD PSGD), designed to\naddress these issues in resource-constrained environments. The framework\nincorporates a hierarchical communication structure using Distributed Kmeans\n(DKmeans) clustering for geographic grouping and a gossip protocol for\nefficient model aggregation across two layers: intra-cluster and inter-cluster.\nWe evaluate the framework's performance against the Centralised Federated\nLearning (CFL) baseline using the MCUNet model on the CIFAR-10 dataset under\nIID and Non-IID conditions. 
Results demonstrate that the proposed method\nachieves comparable accuracy to CFL on IID datasets, requiring only 1.8\nadditional rounds for convergence. On Non-IID datasets, the accuracy loss\nremains under 8\\% for moderate data imbalance. These findings highlight the\nframework's potential to support scalable and privacy-preserving learning on\nedge devices with minimal performance trade-offs.\n","authors":["Ziyuan Bao","Eiman Kanjo","Soumya Banerjee","Hasib-Al Rashid","Tinoosh Mohsenin"],"pdf_url":"https://arxiv.org/pdf/2501.04817v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.16339v2","updated":"2025-01-08T20:11:59Z","published":"2024-12-20T21:00:11Z","title":"Deliberative Alignment: Reasoning Enables Safer Language Models","summary":" As large-scale language models increasingly impact safety-critical domains,\nensuring their reliable adherence to well-defined principles remains a\nfundamental challenge. We introduce Deliberative Alignment, a new paradigm that\ndirectly teaches the model safety specifications and trains it to explicitly\nrecall and accurately reason over the specifications before answering. We used\nthis approach to align OpenAI's o-series models, and achieved highly precise\nadherence to OpenAI's safety policies, without requiring human-written\nchain-of-thoughts or answers. Deliberative Alignment pushes the Pareto frontier\nby simultaneously increasing robustness to jailbreaks while decreasing\noverrefusal rates, and also improves out-of-distribution generalization. We\ndemonstrate that reasoning over explicitly specified policies enables more\nscalable, trustworthy, and interpretable alignment.\n","authors":["Melody Y. 
Guan","Manas Joglekar","Eric Wallace","Saachi Jain","Boaz Barak","Alec Helyar","Rachel Dias","Andrea Vallone","Hongyu Ren","Jason Wei","Hyung Won Chung","Sam Toyer","Johannes Heidecke","Alex Beutel","Amelia Glaese"],"pdf_url":"https://arxiv.org/pdf/2412.16339v2.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2501.01950v2","updated":"2025-01-08T20:09:16Z","published":"2025-01-03T18:54:26Z","title":"MADGEN: Mass-Spec attends to De Novo Molecular generation","summary":" The annotation (assigning structural chemical identities) of MS/MS spectra\nremains a significant challenge due to the enormous molecular diversity in\nbiological samples and the limited scope of reference databases. Currently, the\nvast majority of spectral measurements remain in the \"dark chemical space\"\nwithout structural annotations. To improve annotation, we propose MADGEN\n(Mass-spec Attends to De Novo Molecular GENeration), a scaffold-based method\nfor de novo molecular structure generation guided by mass spectrometry data.\nMADGEN operates in two stages: scaffold retrieval and spectra-conditioned\nmolecular generation starting with the scaffold. In the first stage, given an\nMS/MS spectrum, we formulate scaffold retrieval as a ranking problem and employ\ncontrastive learning to align mass spectra with candidate molecular scaffolds.\nIn the second stage, starting from the retrieved scaffold, we employ the MS/MS\nspectrum to guide an attention-based generative model to generate the final\nmolecule. Our approach constrains the molecular generation search space,\nreducing its complexity and improving generation accuracy. 
We evaluate MADGEN\non three datasets (NIST23, CANOPUS, and MassSpecGym) and evaluate MADGEN's\nperformance with a predictive scaffold retriever and with an oracle retriever.\nWe demonstrate the effectiveness of using attention to integrate spectral\ninformation throughout the generation process to achieve strong results with\nthe oracle retriever.\n","authors":["Yinkai Wang","Xiaohui Chen","Liping Liu","Soha Hassoun"],"pdf_url":"https://arxiv.org/pdf/2501.01950v2.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2412.18036v2","updated":"2025-01-08T19:44:56Z","published":"2024-12-23T23:09:56Z","title":"Explainability in Neural Networks for Natural Language Processing Tasks","summary":" Neural networks are widely regarded as black-box models, creating significant\nchallenges in understanding their inner workings, especially in natural\nlanguage processing (NLP) applications. To address this opacity, model\nexplanation techniques like Local Interpretable Model-Agnostic Explanations\n(LIME) have emerged as essential tools for providing insights into the behavior\nof these complex systems. This study leverages LIME to interpret a multi-layer\nperceptron (MLP) neural network trained on a text classification task. By\nanalyzing the contribution of individual features to model predictions, the\nLIME approach enhances interpretability and supports informed decision-making.\nDespite its effectiveness in offering localized explanations, LIME has\nlimitations in capturing global patterns and feature interactions. 
This\nresearch highlights the strengths and shortcomings of LIME and proposes\ndirections for future work to achieve more comprehensive interpretability in\nneural NLP models.\n","authors":["Melkamu Mersha","Mingiziem Bitewa","Tsion Abay","Jugal Kalita"],"pdf_url":"https://arxiv.org/pdf/2412.18036v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.11977v2","updated":"2025-01-08T19:17:14Z","published":"2024-10-15T18:33:42Z","title":"Generative AI Policies under the Microscope: How CS Conferences Are\n Navigating the New Frontier in Scholarly Writing","summary":" This paper explores the current state of generative AI policies of computer\nscience conferences and offers guidelines for policy adoption.\n","authors":["Mahjabin Nahar","Sian Lee","Becky Guillen","Dongwon Lee"],"pdf_url":"https://arxiv.org/pdf/2410.11977v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17309v3","updated":"2025-01-08T19:00:00Z","published":"2024-10-22T18:00:00Z","title":"Literature Meets Data: A Synergistic Approach to Hypothesis Generation","summary":" AI holds promise for transforming scientific processes, including hypothesis\ngeneration. Prior work on hypothesis generation can be broadly categorized into\ntheory-driven and data-driven approaches. While both have proven effective in\ngenerating novel and plausible hypotheses, it remains an open question whether\nthey can complement each other. To address this, we develop the first method\nthat combines literature-based insights with data to perform LLM-powered\nhypothesis generation. We apply our method on five different datasets and\ndemonstrate that integrating literature and data outperforms other baselines\n(8.97\\% over few-shot, 15.75\\% over literature-based alone, and 3.37\\% over\ndata-driven alone). 
Additionally, we conduct the first human evaluation to\nassess the utility of LLM-generated hypotheses in assisting human\ndecision-making on two challenging tasks: deception detection and AI generated\ncontent detection. Our results show that human accuracy improves significantly\nby 7.44\\% and 14.19\\% on these tasks, respectively. These findings suggest that\nintegrating literature-based and data-driven approaches provides a\ncomprehensive and nuanced framework for hypothesis generation and could open\nnew avenues for scientific inquiry.\n","authors":["Haokun Liu","Yangqiaoyu Zhou","Mingxuan Li","Chenfei Yuan","Chenhao Tan"],"pdf_url":"https://arxiv.org/pdf/2410.17309v3.pdf","comment":"37 pages, 9 figures, code link:\n https://github.com/ChicagoHAI/hypothesis-generation"},{"id":"http://arxiv.org/abs/2501.04682v1","updated":"2025-01-08T18:42:48Z","published":"2025-01-08T18:42:48Z","title":"Towards System 2 Reasoning in LLMs: Learning How to Think With Meta\n Chain-of-Thought","summary":" We propose a novel framework, Meta Chain-of-Thought (Meta-CoT), which extends\ntraditional Chain-of-Thought (CoT) by explicitly modeling the underlying\nreasoning required to arrive at a particular CoT. We present empirical evidence\nfrom state-of-the-art models exhibiting behaviors consistent with in-context\nsearch, and explore methods for producing Meta-CoT via process supervision,\nsynthetic data generation, and search algorithms. Finally, we outline a\nconcrete pipeline for training a model to produce Meta-CoTs, incorporating\ninstruction tuning with linearized search traces and reinforcement learning\npost-training. Finally, we discuss open research questions, including scaling\nlaws, verifier roles, and the potential for discovering novel reasoning\nalgorithms. 
This work provides a theoretical and practical roadmap to enable\nMeta-CoT in LLMs, paving the way for more powerful and human-like reasoning in\nartificial intelligence.\n","authors":["Violet Xiang","Charlie Snell","Kanishk Gandhi","Alon Albalak","Anikait Singh","Chase Blagden","Duy Phung","Rafael Rafailov","Nathan Lile","Dakota Mahan","Louis Castricato","Jan-Philipp Franken","Nick Haber","Chelsea Finn"],"pdf_url":"https://arxiv.org/pdf/2501.04682v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04765v1","updated":"2025-01-08T18:38:25Z","published":"2025-01-08T18:38:25Z","title":"TREAD: Token Routing for Efficient Architecture-agnostic Diffusion\n Training","summary":" Diffusion models have emerged as the mainstream approach for visual\ngeneration. However, these models usually suffer from sample inefficiency and\nhigh training costs. This issue is particularly pronounced in the standard\ndiffusion transformer architecture due to its quadratic complexity relative to\ninput length. Recent works have addressed this by reducing the number of tokens\nprocessed in the model, often through masking. In contrast, this work aims to\nimprove the training efficiency of the diffusion backbone by using predefined\nroutes that store this information until it is reintroduced to deeper layers of\nthe model, rather than discarding these tokens entirely. Further, we combine\nmultiple routes and introduce an adapted auxiliary loss that accounts for all\napplied routes. Our method is not limited to the common transformer-based model\n- it can also be applied to state-space models. Unlike most current approaches,\nTREAD achieves this without architectural modifications. Finally, we show that\nour method reduces the computational cost and simultaneously boosts model\nperformance on the standard benchmark ImageNet-1K 256 x 256 in\nclass-conditional synthesis. 
Both of these benefits multiply to a convergence\nspeedup of 9.55x at 400K training iterations compared to DiT and 25.39x\ncompared to the best benchmark performance of DiT at 7M training iterations.\n","authors":["Felix Krause","Timy Phan","Vincent Tao Hu","Björn Ommer"],"pdf_url":"https://arxiv.org/pdf/2501.04765v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.02132v2","updated":"2025-01-08T16:26:36Z","published":"2025-01-03T22:53:43Z","title":"A hybrid marketplace of ideas","summary":" The convergence of humans and artificial intelligence systems introduces new\ndynamics into the cultural and intellectual landscape. Complementing emerging\ncultural evolution concepts such as machine culture, AI agents represent a\nsignificant techno-sociological development, particularly within the\nanthropological study of Web3 as a community focused on decentralization\nthrough blockchain. Despite their growing presence, the cultural significance\nof AI agents remains largely unexplored in academic literature. Toward this\nend, we conceived hybrid netnography, a novel interdisciplinary approach that\nexamines the cultural and intellectual dynamics within digital ecosystems by\nanalyzing the interactions and contributions of both human and AI agents as\nco-participants in shaping narratives, ideas, and cultural artifacts. We argue\nthat, within the Web3 community on the social media platform X, these agents\nchallenge traditional notions of participation and influence in public\ndiscourse, creating a hybrid marketplace of ideas, a conceptual space where\nhuman and AI generated ideas coexist and compete for attention. 
We examine the\ncurrent state of AI agents in idea generation, propagation, and engagement,\npositioning their role as cultural agents through the lens of memetics and\nencouraging further inquiry into their cultural and societal impact.\nAdditionally, we address the implications of this paradigm for privacy,\nintellectual property, and governance, highlighting the societal and legal\nchallenges of integrating AI agents into the hybrid marketplace of ideas.\n","authors":["Tomer Jordi Chaffer","Dontrail Cotlage","Justin Goldston"],"pdf_url":"https://arxiv.org/pdf/2501.02132v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04747v1","updated":"2025-01-08T10:31:16Z","published":"2025-01-08T10:31:16Z","title":"Discovering new robust local search algorithms with neuro-evolution","summary":" This paper explores a novel approach aimed at overcoming existing challenges\nin the realm of local search algorithms. Our aim is to improve the decision\nprocess that takes place within a local search algorithm so as to make the best\npossible transitions in the neighborhood at each iteration. To improve this\nprocess, we propose to use a neural network that has the same input information\nas conventional local search algorithms. In this paper, which is an extension\nof the work [Goudet et al. 2024] presented at EvoCOP2024, we investigate\ndifferent ways of representing this information so as to make the algorithm as\nefficient as possible but also robust to monotonic transformations of the\nproblem objective function. To assess the efficiency of this approach, we\ndevelop an experimental setup centered around NK landscape problems, offering\nthe flexibility to adjust problem size and ruggedness. 
This approach offers a\npromising avenue for the emergence of new local search algorithms and the\nimprovement of their problem-solving capabilities for black-box problems.\n","authors":["Mohamed Salim Amri Sakhri","Adrien Goëffon","Olivier Goudet","Frédéric Saubion","Chaïmaâ Touhami"],"pdf_url":"https://arxiv.org/pdf/2501.04747v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05486v1","updated":"2025-01-08T16:53:25Z","published":"2025-01-08T16:53:25Z","title":"Towards an Ontology of Traceable Impact Management in the Food Supply\n Chain","summary":" The pursuit of quality improvements and accountability in the food supply\nchains, especially how they relate to food-related outcomes, such as hunger,\nhas become increasingly vital, necessitating a comprehensive approach that\nencompasses product quality and its impact on various stakeholders and their\ncommunities. Such an approach offers numerous benefits in increasing product\nquality and eliminating superfluous measurements while appraising and\nalleviating the broader societal and environmental repercussions. A traceable\nimpact management model (TIMM) provides an impact structure and a reporting\nmechanism that identifies each stakeholder's role in the total impact of food\nproduction and consumption stages.\n The model aims to increase traceability's utility in understanding the impact\nof changes on communities affected by food production and consumption, aligning\nwith current and future government requirements, and addressing the needs of\ncommunities and consumers. 
This holistic approach is further supported by an\nontological model that forms the logical foundation and a unified terminology.\nBy proposing a holistic and integrated solution across multiple stakeholders,\nthe model emphasizes quality and the extensive impact of championing\naccountability, sustainability, and responsible practices with global\ntraceability.\n With these combined efforts, the food supply chain moves toward a global\ntracking and tracing process that not only ensures product quality but also\naddresses its impact on a broader scale, fostering accountability,\nsustainability, and responsible food production and consumption.\n","authors":["Bart Gajderowicz","Mark S Fox","Yongchao Gao"],"pdf_url":"https://arxiv.org/pdf/2501.05486v1.pdf","comment":null}]},"2025-01-09T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2409.05545v2","updated":"2025-01-09T18:51:52Z","published":"2024-09-09T12:11:18Z","title":"Adaptive Probabilistic Planning for the Uncertain and Dynamic\n Orienteering Problem","summary":" The Orienteering Problem (OP) is a well-studied routing problem that has been\nextended to incorporate uncertainties, reflecting stochastic or dynamic travel\ncosts, prize-collection costs, and prizes. Existing approaches may, however, be\ninefficient in real-world applications due to insufficient modeling knowledge\nand initially unknowable parameters in online scenarios. Thus, we propose the\nUncertain and Dynamic Orienteering Problem (UDOP), modeling travel costs as\ndistributions with unknown and time-variant parameters. UDOP also associates\nuncertain travel costs with dynamic prizes and prize-collection costs for its\nobjective and budget constraints. To address UDOP, we develop an ADaptive\nApproach for Probabilistic paThs - ADAPT, that iteratively performs 'execution'\nand 'online planning' based on an initial 'offline' solution. The execution\nphase updates system status and records online cost observations. 
The online\nplanner employs a Bayesian approach to adaptively estimate power consumption\nand optimize path sequence based on safety beliefs. We evaluate ADAPT in a\npractical Unmanned Aerial Vehicle (UAV) charging scheduling problem for\nWireless Rechargeable Sensor Networks. The UAV must optimize its path to\nrecharge sensor nodes efficiently while managing its energy under uncertain\nconditions. ADAPT maintains comparable solution quality and computation time\nwhile offering superior robustness. Extensive simulations show that ADAPT\nachieves a 100% Mission Success Rate (MSR) across all tested scenarios,\noutperforming comparable heuristic-based and frequentist approaches that fail\nup to 70% (under challenging conditions) and averaging 67% MSR, respectively.\nThis work advances the field of OP with uncertainties, offering a reliable and\nefficient approach for real-world applications in uncertain and dynamic\nenvironments.\n","authors":["Qiuchen Qian","Yanran Wang","David Boyle"],"pdf_url":"https://arxiv.org/pdf/2409.05545v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05439v1","updated":"2025-01-09T18:49:39Z","published":"2025-01-09T18:49:39Z","title":"From Simple to Complex Skills: The Case of In-Hand Object Reorientation","summary":" Learning policies in simulation and transferring them to the real world has\nbecome a promising approach in dexterous manipulation. However, bridging the\nsim-to-real gap for each new task requires substantial human effort, such as\ncareful reward engineering, hyperparameter tuning, and system identification.\nIn this work, we present a system that leverages low-level skills to address\nthese challenges for more complex tasks. Specifically, we introduce a\nhierarchical policy for in-hand object reorientation based on previously\nacquired rotation skills. This hierarchical policy learns to select which\nlow-level skill to execute based on feedback from both the environment and the\nlow-level skill policies themselves. 
Compared to learning from scratch, the\nhierarchical policy is more robust to out-of-distribution changes and transfers\neasily from simulation to real-world environments. Additionally, we propose a\ngeneralizable object pose estimator that uses proprioceptive information,\nlow-level skill predictions, and control errors as inputs to estimate the\nobject pose over time. We demonstrate that our system can reorient objects,\nincluding symmetrical and textureless ones, to a desired pose.\n","authors":["Haozhi Qi","Brent Yi","Mike Lambeta","Yi Ma","Roberto Calandra","Jitendra Malik"],"pdf_url":"https://arxiv.org/pdf/2501.05439v1.pdf","comment":"website: https://dexhier.github.io"},{"id":"http://arxiv.org/abs/2501.05420v1","updated":"2025-01-09T18:22:10Z","published":"2025-01-09T18:22:10Z","title":"RoboPanoptes: The All-seeing Robot with Whole-body Dexterity","summary":" We present RoboPanoptes, a capable yet practical robot system that achieves\nwhole-body dexterity through whole-body vision. Its whole-body dexterity allows\nthe robot to utilize its entire body surface for manipulation, such as\nleveraging multiple contact points or navigating constrained spaces. Meanwhile,\nwhole-body vision uses a camera system distributed over the robot's surface to\nprovide comprehensive, multi-perspective visual feedback of its own and the\nenvironment's state. At its core, RoboPanoptes uses a whole-body visuomotor\npolicy that learns complex manipulation skills directly from human\ndemonstrations, efficiently aggregating information from the distributed\ncameras while maintaining resilience to sensor failures. Together, these design\naspects unlock new capabilities and tasks, allowing RoboPanoptes to unbox in\nnarrow spaces, sweep multiple or oversized objects, and succeed in multi-step\nstowing in cluttered environments, outperforming baselines in adaptability and\nefficiency. 
Results are best viewed on https://robopanoptes.github.io.\n","authors":["Xiaomeng Xu","Dominik Bauer","Shuran Song"],"pdf_url":"https://arxiv.org/pdf/2501.05420v1.pdf","comment":"Project website: https://robopanoptes.github.io"},{"id":"http://arxiv.org/abs/2501.05418v1","updated":"2025-01-09T18:20:57Z","published":"2025-01-09T18:20:57Z","title":"Virtual-Work Based Shape-Force Sensing for Continuum Instruments with\n Tension-Feedback Actuation","summary":" Continuum instruments are integral to robot-assisted minimally invasive\nsurgery (MIS), with tendon-driven mechanisms being the most common. Real-time\ntension feedback is crucial for precise articulation but remains a challenge in\ncompact actuation unit designs. Additionally, accurate shape and external force\nsensing of continuum instruments are essential for advanced control and\nmanipulation. This paper presents a compact and modular actuation unit that\nintegrates a torque cell directly into the pulley module to provide real-time\ntension feedback. Building on this unit, we propose a novel shape-force sensing\nframework that incorporates polynomial curvature kinematics to accurately model\nnon-constant curvature. 
The framework combines pose sensor measurements at the\ninstrument tip and actuation tension feedback at the developed actuation unit.\nExperimental results demonstrate the improved performance of the proposed\nshape-force sensing framework in terms of shape reconstruction accuracy and\nforce estimation reliability compared to conventional constant-curvature\nmethods.\n","authors":["Guoqing Zhang","Zihan Chen","Long Wang"],"pdf_url":"https://arxiv.org/pdf/2501.05418v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05411v1","updated":"2025-01-09T18:10:16Z","published":"2025-01-09T18:10:16Z","title":"Adaptive Path-Planning for Autonomous Robots: A UCH-Enhanced Q-Learning\n Approach","summary":" Q-learning methods are widely used in robot path planning but often face\nchallenges of inefficient search and slow convergence. We propose an Improved\nQ-learning (IQL) framework that enhances standard Q-learning in two significant\nways. First, we introduce the Path Adaptive Collaborative Optimization (PACO)\nalgorithm to optimize Q-table initialization, providing better initial\nestimates and accelerating learning. Second, we incorporate a\nUtility-Controlled Heuristic (UCH) mechanism with dynamically tuned parameters\nto optimize the reward function, enhancing the algorithm's accuracy and\neffectiveness in path-planning tasks. Extensive experiments in three different\nraster grid environments validate the superior performance of our IQL\nframework. 
The results demonstrate that our IQL algorithm outperforms existing\nmethods, including FIQL, PP-QL-based CPP, DFQL, and QMABC algorithms, in terms\nof path-planning capabilities.\n","authors":["Wei Liu","Ruiyang Wang","Haonan Wang","Guangwei Liu"],"pdf_url":"https://arxiv.org/pdf/2501.05411v1.pdf","comment":"25 pages, 20 figures"},{"id":"http://arxiv.org/abs/2408.00846v2","updated":"2025-01-09T16:55:55Z","published":"2024-08-01T18:01:23Z","title":"Occupation-aware planning method for robotic monitoring missions in\n dynamic environments","summary":" This paper presents a method for robotic monitoring missions in the presence\nof moving obstacles. Although the scenario map is known, the robot lacks\ninformation about the movement of dynamic obstacles during the monitoring\nmission. Numerous local planners have been developed in recent years for\nnavigating highly dynamic environments. However, the absence of a global\nplanner for these environments can result in unavoidable collisions or the\ninability to successfully complete missions in densely populated areas, such as\na scenario monitoring in our case. This work addresses the development and\nevaluation of a global planner, $MADA$ (Monitoring Avoiding Dynamic Areas),\naimed at enhancing the deployment of robots in such challenging conditions. The\nrobot plans and executes the mission using the proposed two-step approach. The\nfirst step involves selecting the observation goal based on the environment's\ndistribution and estimated monitoring costs. In the second step, the robot\nidentifies areas with moving obstacles and obtains paths avoiding densely\noccupied dynamic regions based on their occupation. 
Quantitative and\nqualitative results based on simulations and on real-world experimentation,\nconfirm that the proposed method allows the robot to effectively monitor most\nof the environment while avoiding densely occupied dynamic areas.\n","authors":["Yaroslav Marchukov","Luis Montano"],"pdf_url":"https://arxiv.org/pdf/2408.00846v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.00242v2","updated":"2025-01-09T16:43:16Z","published":"2024-12-31T03:17:05Z","title":"Automotive Speed Estimation: Sensor Types and Error Characteristics from\n OBD-II to ADAS","summary":" Modern on-road navigation systems heavily depend on integrating speed\nmeasurements with inertial navigation systems (INS) and global navigation\nsatellite systems (GNSS). Telemetry-based applications typically source speed\ndata from the On-Board Diagnostic II (OBD-II) system. However, the method of\nderiving speed, as well as the types of sensors used to measure wheel speed,\ndiffers across vehicles. These differences result in varying error\ncharacteristics that must be accounted for in navigation and autonomy\napplications. This paper addresses this gap by examining the diverse\nspeed-sensing technologies employed in standard automotive systems and\nalternative techniques used in advanced systems designed for higher levels of\nautonomy, such as Advanced Driver Assistance Systems (ADAS), Autonomous Driving\n(AD), or surveying applications. We propose a method to identify the type of\nspeed sensor in a vehicle and present strategies for accurately modeling its\nerror characteristics. To validate our approach, we collected and analyzed data\nfrom three long real road trajectories conducted in urban environments in\nToronto and Kingston, Ontario, Canada. 
The results underscore the critical role\nof integrating multiple sensor modalities to achieve more accurate speed\nestimation, thus improving automotive navigation state estimation, particularly\nin GNSS-denied environments.\n","authors":["Hany Ragab","Sidney Givigi","Aboelmagd Noureldin"],"pdf_url":"https://arxiv.org/pdf/2501.00242v2.pdf","comment":"7 pages, 12 figures, to be published in conference proceedings"},{"id":"http://arxiv.org/abs/2501.05329v1","updated":"2025-01-09T15:55:08Z","published":"2025-01-09T15:55:08Z","title":"Knowledge Transfer in Model-Based Reinforcement Learning Agents for\n Efficient Multi-Task Learning","summary":" We propose an efficient knowledge transfer approach for model-based\nreinforcement learning, addressing the challenge of deploying large world\nmodels in resource-constrained environments. Our method distills a\nhigh-capacity multi-task agent (317M parameters) into a compact 1M parameter\nmodel, achieving state-of-the-art performance on the MT30 benchmark with a\nnormalized score of 28.45, a substantial improvement over the original 1M\nparameter model's score of 18.93. This demonstrates the ability of our\ndistillation technique to consolidate complex multi-task knowledge effectively.\nAdditionally, we apply FP16 post-training quantization, reducing the model size\nby 50% while maintaining performance. 
Our work bridges the gap between the\npower of large models and practical deployment constraints, offering a scalable\nsolution for efficient and accessible multi-task reinforcement learning in\nrobotics and other resource-limited domains.\n","authors":["Dmytro Kuzmenko","Nadiya Shvai"],"pdf_url":"https://arxiv.org/pdf/2501.05329v1.pdf","comment":"Preprint of an extended abstract accepted to AAMAS 2025"},{"id":"http://arxiv.org/abs/2405.17794v2","updated":"2025-01-09T15:15:40Z","published":"2024-05-28T03:45:32Z","title":"LNS2+RL: Combining Multi-Agent Reinforcement Learning with Large\n Neighborhood Search in Multi-Agent Path Finding","summary":" Multi-Agent Path Finding (MAPF) is a critical component of logistics and\nwarehouse management, which focuses on planning collision-free paths for a team\nof robots in a known environment. Recent work introduced a novel MAPF approach,\nLNS2, which proposed to repair a quickly obtained set of infeasible paths via\niterative replanning, by relying on a fast, yet lower-quality, prioritized\nplanning (PP) algorithm. At the same time, there has been a recent push for\nMulti-Agent Reinforcement Learning (MARL) based MAPF algorithms, which exhibit\nimproved cooperation over such PP algorithms, although inevitably remaining\nslower. In this paper, we introduce a new MAPF algorithm, LNS2+RL, which\ncombines the distinct yet complementary characteristics of LNS2 and MARL to\neffectively balance their individual limitations and get the best from both\nworlds. During early iterations, LNS2+RL relies on MARL for low-level\nreplanning, which we show eliminates collisions much more than a PP algorithm.\nThere, our MARL-based planner allows agents to reason about past and future\ninformation to gradually learn cooperative decision-making through a finely\ndesigned curriculum learning. 
At later stages of planning, LNS2+RL adaptively\nswitches to PP algorithm to quickly resolve the remaining collisions, naturally\ntrading off solution quality (number of collisions in the solution) and\ncomputational efficiency. Our comprehensive experiments on high-agent-density\ntasks across various team sizes, world sizes, and map structures consistently\ndemonstrate the superior performance of LNS2+RL compared to many MAPF\nalgorithms, including LNS2, LaCAM, EECBS, and SCRIMP. In maps with complex\nstructures, the advantages of LNS2+RL are particularly pronounced, with LNS2+RL\nachieving a success rate of over 50% in nearly half of the tested tasks, while\nthat of LaCAM, EECBS and SCRIMP falls to 0%.\n","authors":["Yutong Wang","Tanishq Duhan","Jiaoyang Li","Guillaume Sartoretti"],"pdf_url":"https://arxiv.org/pdf/2405.17794v2.pdf","comment":"Accepted for presentation at AAAI 2025"},{"id":"http://arxiv.org/abs/2501.02580v2","updated":"2025-01-09T15:10:09Z","published":"2025-01-05T15:26:36Z","title":"LP-ICP: General Localizability-Aware Point Cloud Registration for Robust\n Localization in Extreme Unstructured Environments","summary":" The Iterative Closest Point (ICP) algorithm is a crucial component of\nLiDAR-based SLAM algorithms. However, its performance can be negatively\naffected in unstructured environments that lack features and geometric\nstructures, leading to low accuracy and poor robustness in localization and\nmapping. It is known that degeneracy caused by the lack of geometric\nconstraints can lead to errors in 6-DOF pose estimation along ill-conditioned\ndirections. Therefore, there is a need for a broader and more fine-grained\ndegeneracy detection and handling method. This paper proposes a new point cloud\nregistration framework, LP-ICP, that combines point-to-line and point-to-plane\ndistance metrics in the ICP algorithm, with localizability detection and\nhandling. 
LP-ICP consists of a localizability detection module and an\noptimization module. The localizability detection module performs\nlocalizability analysis by utilizing the correspondences between edge points\n(with low local smoothness) to lines and planar points (with high local\nsmoothness) to planes between the scan and the map. The localizability\ncontribution of individual correspondence constraints can be applied to a\nbroader range. The optimization module adds additional soft and hard\nconstraints to the optimization equations based on the localizability category.\nThis allows the pose to be constrained along ill-conditioned directions, with\nupdates either tending towards the constraint value or leaving the initial\nestimate unchanged. This improves accuracy and reduces fluctuations. The\nproposed method is extensively evaluated through experiments on both simulation\nand real-world datasets, demonstrating higher or comparable accuracy than the\nstate-of-the-art methods. The dataset and code of this paper will also be\nopen-sourced at https://github.com/xuqingyuan2000/LP-ICP.\n","authors":["Haosong Yue","Qingyuan Xu","Fei Chen","Jia Pan","Weihai Chen"],"pdf_url":"https://arxiv.org/pdf/2501.02580v2.pdf","comment":"18 Pages, 8 Figures Submitted to IEEE Transactions on Automation\n Science and Engineering"},{"id":"http://arxiv.org/abs/2310.09589v2","updated":"2025-01-09T14:46:41Z","published":"2023-10-14T14:11:46Z","title":"Airborne Sense and Detect of Drones using Deep Learning and LiDAR Point\n Clouds","summary":" The safe operation of drone swarms beyond visual line of sight requires\nmultiple safeguards to mitigate the risk of collision between drones flying in\nclose-proximity scenarios. Cooperative navigation and flight coordination\nstrategies that rely on pre-planned trajectories, constant satellite and\nnetwork connectivity and reliable Global Navigation Satellite System (GNSS)\npositioning are brittle to failure. 
Drone embedded sense and detect offers a\ncomprehensive mode of separation between drones for deconfliction and collision\navoidance. This paper presents the first airborne LiDAR based solution for\ndrone-swarm detection and localization using 3D deep learning model. It adapts\nan existing deep learning neural network to the air-to-air drone scenario by\nexpanding the scan space vertically. A new sparse convolution is proposed and\napplied to accelerate the backbone layer, which is the most time-consuming part\nof the neural network. To collect training data of safety critical,\nclose-proximity multi-drone operations, a scenario Digital Twin is used to\naugment real datasets with high fidelity synthetic data. The trained model\nachieves over 80% recall and 96% precision when tested on real-world datasets.\nBy incorporating a tracking-by-detection algorithm the system can reliably\nmonitor the separation distance of multiple drones in challenging environments.\n","authors":["Manduhu Manduhu","Alexander Dow","Petar Trslic","Gerard Dooly","Benjamin Blanck","James Riordan"],"pdf_url":"https://arxiv.org/pdf/2310.09589v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15589v2","updated":"2025-01-09T14:30:07Z","published":"2024-09-23T22:47:42Z","title":"Beyond Humanoid Prosthetic Hands: Modular Terminal Devices That Improve\n User Performance","summary":" Despite decades of research and development, myoelectric prosthetic hands\nlack functionality and are often rejected by users. This lack in functionality\ncan be partially attributed to the widely accepted anthropomorphic design\nideology in the field; attempting to replicate human hand form and function\ndespite severe limitations in control and sensing technology. Instead,\nprosthetic hands can be tailored to perform specific tasks without increasing\ncomplexity by shedding the constraints of anthropomorphism. 
In this paper, we\ndevelop and evaluate four open-source modular non-humanoid devices to perform\nthe motion required to replicate human flicking motion and to twist a\nscrewdriver, and the functionality required to pick and place flat objects and\nto cut paper. Experimental results from these devices demonstrate that, versus\na humanoid prosthesis, non-humanoid prosthesis design dramatically improves\ntask performance, reduces user compensatory movement, and reduces task load.\nCase studies with two end users demonstrate the translational benefits of this\nresearch. We found that special attention should be paid to monitoring end-user\ntask load to ensure positive rehabilitation outcomes.\n","authors":["Digby Chappell","Barry Mulvey","Shehara Perera","Fernando Bello","Petar Kormushev","Nicolas Rojas"],"pdf_url":"https://arxiv.org/pdf/2409.15589v2.pdf","comment":"10 pages, 10 figures, 2 tables. Accepted for publication in IEEE\n Transactions on Neural Systems and Rehabilitation Engineering"},{"id":"http://arxiv.org/abs/2409.16828v3","updated":"2025-01-09T14:10:38Z","published":"2024-09-25T11:29:26Z","title":"On the role of Artificial Intelligence methods in modern\n force-controlled manufacturing robotic tasks","summary":" This position paper explores the integration of Artificial Intelligence (AI)\ninto force-controlled robotic tasks within the scope of advanced manufacturing,\na cornerstone of Industry 4.0. AI's role in enhancing robotic manipulators -\nkey drivers in the Fourth Industrial Revolution - is rapidly leading to\nsignificant innovations in smart manufacturing. The objective of this article\nis to frame these innovations in practical force-controlled applications - e.g.\ndeburring, polishing, and assembly tasks like peg-in-hole (PiH) - highlighting\ntheir necessity for maintaining high-quality production standards. 
By reporting\non recent AI-based methodologies, this article contrasts them and identifies\ncurrent challenges to be addressed in future research. The analysis concludes\nwith a perspective on future research directions, emphasizing the need for\ncommon performance metrics to validate AI techniques, integration of various\nenhancements for performance optimization, and the importance of validating\nthem in relevant scenarios. These future directions aim to provide consistency\nwith already adopted approaches, so as to be compatible with manufacturing\nstandards, increasing the relevance of AI-driven methods in both academic and\nindustrial contexts.\n","authors":["Vincenzo Petrone","Enrico Ferrentino","Pasquale Chiacchio"],"pdf_url":"https://arxiv.org/pdf/2409.16828v3.pdf","comment":"In Proceedings of the 21st International Conference on Informatics in\n Control, Automation and Robotics - Volume 1: ICINCO, 392-399, 2024 , Porto,\n Portugal"},{"id":"http://arxiv.org/abs/2311.16623v2","updated":"2025-01-09T13:59:21Z","published":"2023-11-28T09:24:42Z","title":"Visual Semantic Navigation with Real Robots","summary":" Visual Semantic Navigation (VSN) is the ability of a robot to learn visual\nsemantic information for navigating in unseen environments. These VSN models\nare typically tested in those virtual environments where they are trained,\nmainly using reinforcement learning based approaches. Therefore, we do not yet\nhave an in-depth analysis of how these models would behave in the real world.\nIn this work, we propose a new solution to integrate VSN models into real\nrobots, so that we have true embodied agents. We also release a novel ROS-based\nframework for VSN, ROS4VSN, so that any VSN-model can be easily deployed in any\nROS-compatible robot and tested in a real setting. 
Our experiments with two\ndifferent robots, where we have embedded two state-of-the-art VSN agents,\nconfirm that there is a noticeable performance difference of these VSN\nsolutions when tested in real-world and simulation environments. We hope that\nthis research will endeavor to provide a foundation for addressing this\nconsequential issue, with the ultimate aim of advancing the performance and\nefficiency of embodied agents within authentic real-world scenarios. Code to\nreproduce all our experiments can be found at\nhttps://github.com/gramuah/ros4vsn.\n","authors":["Carlos Gutiérrez-Álvarez","Pablo Ríos-Navarro","Rafael Flor-Rodríguez","Francisco Javier Acevedo-Rodríguez","Roberto J. López-Sastre"],"pdf_url":"https://arxiv.org/pdf/2311.16623v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05204v1","updated":"2025-01-09T12:55:21Z","published":"2025-01-09T12:55:21Z","title":"Design and Control of a Bipedal Robotic Character","summary":" Legged robots have achieved impressive feats in dynamic locomotion in\nchallenging unstructured terrain. However, in entertainment applications, the\ndesign and control of these robots face additional challenges in appealing to\nhuman audiences. This work aims to unify expressive, artist-directed motions\nand robust dynamic mobility for legged robots. To this end, we introduce a new\nbipedal robot, designed with a focus on character-driven mechanical features.\nWe present a reinforcement learning-based control architecture to robustly\nexecute artistic motions conditioned on command signals. During runtime, these\ncommand signals are generated by an animation engine which composes and blends\nbetween multiple animation sources. Finally, an intuitive operator interface\nenables real-time show performances with the robot. 
The complete system results\nin a believable robotic character, and paves the way for enhanced human-robot\nengagement in various contexts, in entertainment robotics and beyond.\n","authors":["Ruben Grandia","Espen Knoop","Michael A. Hopkins","Georg Wiedebach","Jared Bishop","Steven Pickles","David Müller","Moritz Bächer"],"pdf_url":"https://arxiv.org/pdf/2501.05204v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05198v1","updated":"2025-01-09T12:48:40Z","published":"2025-01-09T12:48:40Z","title":"Dexterous Manipulation of Deformable Objects via Pneumatic Gripping:\n Lifting by One End","summary":" Manipulating deformable objects in robotic cells is often costly and not\nwidely accessible. However, the use of localized pneumatic gripping systems can\nenhance accessibility. Current methods that use pneumatic grippers to handle\ndeformable objects struggle with effective lifting. This paper introduces a\nmethod for the dexterous lifting of textile deformable objects from one edge,\nutilizing a previously developed gripper designed for flexible and porous\nmaterials. By precisely adjusting the orientation and position of the gripper\nduring the lifting process, we were able to significantly reduce necessary\ngripping force and minimize object vibration caused by airflow. This method was\ntested and validated on four materials with varying mass, friction, and\nflexibility. The proposed approach facilitates the lifting of deformable\nobjects from a conveyor or automated line, even when only one edge is\naccessible for grasping. 
Future work will involve integrating a vision system\nto optimize the manipulation of deformable objects with more complex shapes.\n","authors":["Roman Mykhailyshyn","Jonathan Lee","Mykhailo Mykhailyshyn","Kensuke Harada","Ann Majewicz Fey"],"pdf_url":"https://arxiv.org/pdf/2501.05198v1.pdf","comment":"Submitted to RA-L"},{"id":"http://arxiv.org/abs/2501.05156v1","updated":"2025-01-09T11:23:31Z","published":"2025-01-09T11:23:31Z","title":"State-Based Disassembly Planning","summary":" It has been shown recently that physics-based simulation significantly\nenhances the disassembly capabilities of real-world assemblies with diverse 3D\nshapes and stringent motion constraints. However, the efficiency suffers when\ntackling intricate disassembly tasks that require numerous simulations and\nincreased simulation time. In this work, we propose a State-Based Disassembly\nPlanning (SBDP) approach, prioritizing physics-based simulation with\ntranslational motion over rotational motion to facilitate autonomy, reducing\ndependency on human input, while storing intermediate motion states to improve\nsearch scalability. We introduce two novel evaluation functions derived from\nnew Directional Blocking Graphs (DBGs) enriched with state information to scale\nup the search. Our experiments show that SBDP with new evaluation functions and\nDBGs constraints outperforms the state-of-the-art in disassembly planning in\nterms of success rate and computational efficiency over benchmark datasets\nconsisting of thousands of physically valid industrial assemblies.\n","authors":["Chao Lei","Nir Lipovetzky","Krista A. 
Ehinger"],"pdf_url":"https://arxiv.org/pdf/2501.05156v1.pdf","comment":"Accepted at AAAI 2025 (extended version)"},{"id":"http://arxiv.org/abs/2501.05153v1","updated":"2025-01-09T11:16:00Z","published":"2025-01-09T11:16:00Z","title":"Assisting MoCap-Based Teleoperation of Robot Arm using Augmented Reality\n Visualisations","summary":" Teleoperating a robot arm involves the human operator positioning the robot's\nend-effector or programming each joint. Whereas humans can control their own\narms easily by integrating visual and proprioceptive feedback, it is\nchallenging to control an external robot arm in the same way, due to its\ninconsistent orientation and appearance. We explore teleoperating a robot arm\nthrough motion-capture (MoCap) of the human operator's arm with the assistance\nof augmented reality (AR) visualisations. We investigate how AR helps\nteleoperation by visualising a virtual reference of the human arm alongside the\nrobot arm to help users understand the movement mapping. We found that the AR\noverlay of a humanoid arm on the robot in the same orientation helped users\nlearn the control. We discuss findings and future work on MoCap-based robot\nteleoperation.\n","authors":["Qiushi Zhou","Antony Chacon","Jiahe Pan","Wafa Johal"],"pdf_url":"https://arxiv.org/pdf/2501.05153v1.pdf","comment":"5 pages, 7 figures, accepted to HRI 2025"},{"id":"http://arxiv.org/abs/2403.14320v3","updated":"2025-01-09T10:59:37Z","published":"2024-03-21T11:41:39Z","title":"Exosense: A Vision-Based Scene Understanding System For Exoskeletons","summary":" Self-balancing exoskeletons are a key enabling technology for individuals\nwith mobility impairments. While the current challenges focus on\nhuman-compliant hardware and control, unlocking their use for daily activities\nrequires a scene perception system. In this work, we present Exosense, a\nvision-centric scene understanding system for self-balancing exoskeletons. 
We\nintroduce a multi-sensor visual-inertial mapping device as well as a navigation\nstack for state estimation, terrain mapping and long-term operation. We tested\nExosense attached to both a human leg and Wandercraft's Personal Exoskeleton in\nreal-world indoor scenarios. This enabled us to test the system during typical\nperiodic walking gaits, as well as future uses in multi-story environments. We\ndemonstrate that Exosense can achieve an odometry drift of about 4 cm per meter\ntraveled, and construct terrain maps under 1 cm average reconstruction error.\nIt can also work in a visual localization mode in a previously mapped\nenvironment, providing a step towards long-term operation of exoskeletons.\n","authors":["Jianeng Wang","Matias Mattamala","Christina Kassab","Guillaume Burger","Fabio Elnecave","Lintong Zhang","Marine Petriaux","Maurice Fallon"],"pdf_url":"https://arxiv.org/pdf/2403.14320v3.pdf","comment":"8 pages, 9 figures"},{"id":"http://arxiv.org/abs/2501.05147v1","updated":"2025-01-09T10:56:50Z","published":"2025-01-09T10:56:50Z","title":"A Systematic Literature Review on Deep Learning-based Depth Estimation\n in Computer Vision","summary":" Depth estimation (DE) provides spatial information about a scene and enables\ntasks such as 3D reconstruction, object detection, and scene understanding.\nRecently, there has been an increasing interest in using deep learning\n(DL)-based methods for DE. Traditional techniques rely on handcrafted features\nthat often struggle to generalise to diverse scenes and require extensive\nmanual tuning. However, DL models for DE can automatically extract relevant\nfeatures from input data, adapt to various scene conditions, and generalise\nwell to unseen environments. Numerous DL-based methods have been developed,\nmaking it necessary to survey and synthesize the state-of-the-art (SOTA).\nPrevious reviews on DE have mainly focused on either monocular or stereo-based\ntechniques, rather than comprehensively reviewing DE. 
Furthermore, to the best\nof our knowledge, there is no systematic literature review (SLR) that\ncomprehensively focuses on DE. Therefore, this SLR study is being conducted.\nInitially, electronic databases were searched for relevant publications,\nresulting in 1284 publications. Using defined exclusion and quality criteria,\n128 publications were shortlisted and further filtered to select 59\nhigh-quality primary studies. These studies were analysed to extract data and\nanswer defined research questions. Based on the results, DL methods were\ndeveloped for mainly three different types of DE: monocular, stereo, and\nmulti-view. 20 publicly available datasets were used to train, test, and\nevaluate DL models for DE, with KITTI, NYU Depth V2, and Make 3D being the most\nused datasets. 29 evaluation metrics were used to assess the performance of DE.\n35 base models were reported in the primary studies, and the top five most-used\nbase models were ResNet-50, ResNet-18, ResNet-101, U-Net, and VGG-16. Finally,\nthe lack of ground truth data was among the most significant challenges\nreported by primary studies.\n","authors":["Ali Rohan","Md Junayed Hasan","Andrei Petrovski"],"pdf_url":"https://arxiv.org/pdf/2501.05147v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05141v1","updated":"2025-01-09T10:47:03Z","published":"2025-01-09T10:47:03Z","title":"OfficeMate: Pilot Evaluation of an Office Assistant Robot","summary":" Office Assistant Robots (OARs) offer a promising solution to proactively\nprovide in-situ support to enhance employee well-being and productivity in\noffice spaces. 
We introduce OfficeMate, a social OAR designed to assist with\npractical tasks, foster social interaction, and promote health and well-being.\nThrough a pilot evaluation with seven participants in an office environment, we\nfound that users see potential in OARs for reducing stress and promoting\nhealthy habits and value the robot's ability to provide companionship and\nphysical activity reminders in the office space. However, concerns regarding\nprivacy, communication, and the robot's interaction timing were also raised.\nThe feedback highlights the need to carefully consider the robot's appearance\nand behaviour to ensure it enhances user experience and aligns with office\nsocial norms. We believe these insights will better inform the development of\nadaptive, intelligent OAR systems for future office space integration.\n","authors":["Jiahe Pan","Sarah Schömbs","Yan Zhang","Ramtin Tabatabaei","Muhammad Bilal","Wafa Johal"],"pdf_url":"https://arxiv.org/pdf/2501.05141v1.pdf","comment":"5 pages, 1 figure, accepted to HRI 2025"},{"id":"http://arxiv.org/abs/2501.05107v1","updated":"2025-01-09T09:54:31Z","published":"2025-01-09T09:54:31Z","title":"Harnessing the Power of Vibration Motors to Develop Miniature Untethered\n Robotic Fishes","summary":" Miniature underwater robots play a crucial role in the exploration and\ndevelopment of marine resources, particularly in confined spaces and\nhigh-pressure deep-sea environments. This study presents the design,\noptimization, and performance of a miniature robotic fish, powered by the\noscillation of bio-inspired fins. These fins feature a rigid-flexible hybrid\nstructure and use an eccentric rotating mass (ERM) vibration motor as the\nexcitation source to generate high-frequency unidirectional oscillations that\ninduce acoustic streaming for propulsion. 
The drive mechanism, powered by\nminiature ERM vibration motors, eliminates the need for complex mechanical\ndrive systems, enabling complete isolation of the entire drive system from the\nexternal environment and facilitating the miniaturization of the robotic fish.\nA compact, untethered robotic fish, measuring 85*60*45 mm^3, is equipped with\nthree bio-inspired fins located at the pectoral and caudal positions.\nExperimental results demonstrate that the robotic fish achieves a maximum\nforward swimming speed of 1.36 body lengths (BL) per second powered by all fins\nand a minimum turning radius of 0.6 BL when powered by a single fin. These\nresults underscore the significance of employing the ERM vibration motor in\nadvancing the development of highly maneuverable, miniature untethered\nunderwater robots for various marine exploration tasks.\n","authors":["Chongjie Jiang","Yingying Dai","Jinyang Le","Xiaomeng Chen","Yu Xie","Wei Zhou","Fuzhou Niu","Ying Li","Tao Luo"],"pdf_url":"https://arxiv.org/pdf/2501.05107v1.pdf","comment":"8 pages, 8 figures"},{"id":"http://arxiv.org/abs/2501.05087v1","updated":"2025-01-09T09:11:40Z","published":"2025-01-09T09:11:40Z","title":"Enhanced Quantile Regression with Spiking Neural Networks for Long-Term\n System Health Prognostics","summary":" This paper presents a novel predictive maintenance framework centered on\nEnhanced Quantile Regression Neural Networks (EQRNNs) for anticipating system\nfailures in industrial robotics. We address the challenge of early failure\ndetection through a hybrid approach that combines advanced neural\narchitectures. The system leverages dual computational stages: first\nimplementing an EQRNN optimized for processing multi-sensor data streams\nincluding vibration, thermal, and power signatures, followed by an integrated\nSpiking Neural Network (SNN) layer that enables microsecond-level response\ntimes. 
This architecture achieves notable accuracy rates of 92.3\\% in component\nfailure prediction with a 90-hour advance warning window. Field testing\nconducted on an industrial scale with 50 robotic systems demonstrates\nsignificant operational improvements, yielding a 94\\% decrease in unexpected\nsystem failures and 76\\% reduction in maintenance-related downtimes. The\nframework's effectiveness in processing complex, multi-modal sensor data while\nmaintaining computational efficiency validates its applicability for Industry\n4.0 manufacturing environments.\n","authors":["David J Poland"],"pdf_url":"https://arxiv.org/pdf/2501.05087v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05057v1","updated":"2025-01-09T08:28:16Z","published":"2025-01-09T08:28:16Z","title":"LearningFlow: Automated Policy Learning Workflow for Urban Driving with\n Large Language Models","summary":" Recent advancements in reinforcement learning (RL) demonstrate the\nsignificant potential in autonomous driving. Despite this promise, challenges\nsuch as the manual design of reward functions and low sample efficiency in\ncomplex environments continue to impede the development of safe and effective\ndriving policies. To tackle these issues, we introduce LearningFlow, an\ninnovative automated policy learning workflow tailored to urban driving. This\nframework leverages the collaboration of multiple large language model (LLM)\nagents throughout the RL training process. LearningFlow includes a curriculum\nsequence generation process and a reward generation process, which work in\ntandem to guide the RL policy by generating tailored training curricula and\nreward functions. Particularly, each process is supported by an analysis agent\nthat evaluates training progress and provides critical insights to the\ngeneration agent. 
Through the collaborative efforts of these LLM agents,\nLearningFlow automates policy learning across a series of complex driving\ntasks, and it significantly reduces the reliance on manual reward function\ndesign while enhancing sample efficiency. Comprehensive experiments are\nconducted in the high-fidelity CARLA simulator, along with comparisons with\nother existing methods, to demonstrate the efficacy of our proposed approach.\nThe results demonstrate that LearningFlow excels in generating rewards and\ncurricula. It also achieves superior performance and robust generalization\nacross various driving tasks, as well as commendable adaptation to different RL\nalgorithms.\n","authors":["Zengqi Peng","Yubin Wang","Xu Han","Lei Zheng","Jun Ma"],"pdf_url":"https://arxiv.org/pdf/2501.05057v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05031v1","updated":"2025-01-09T07:43:49Z","published":"2025-01-09T07:43:49Z","title":"ECBench: Can Multi-modal Foundation Models Understand the Egocentric\n World? A Holistic Embodied Cognition Benchmark","summary":" The enhancement of generalization in robots by large vision-language models\n(LVLMs) is increasingly evident. Therefore, the embodied cognitive abilities of\nLVLMs based on egocentric videos are of great interest. However, current\ndatasets for embodied video question answering lack comprehensive and\nsystematic evaluation frameworks. Critical embodied cognitive issues, such as\nrobotic self-cognition, dynamic scene perception, and hallucination, are rarely\naddressed. To tackle these challenges, we propose ECBench, a high-quality\nbenchmark designed to systematically evaluate the embodied cognitive abilities\nof LVLMs. ECBench features a diverse range of scene video sources, open and\nvaried question formats, and 30 dimensions of embodied cognition. 
To ensure\nquality, balance, and high visual dependence, ECBench uses class-independent\nmeticulous human annotation and multi-round question screening strategies.\nAdditionally, we introduce ECEval, a comprehensive evaluation system that\nensures the fairness and rationality of the indicators. Utilizing ECBench, we\nconduct extensive evaluations of proprietary, open-source, and task-specific\nLVLMs. ECBench is pivotal in advancing the embodied cognitive capabilities of\nLVLMs, laying a solid foundation for developing reliable core models for\nembodied agents. All data and code are available at\nhttps://github.com/Rh-Dang/ECBench.\n","authors":["Ronghao Dang","Yuqian Yuan","Wenqi Zhang","Yifei Xin","Boqiang Zhang","Long Li","Liuyi Wang","Qinyang Zeng","Xin Li","Lidong Bing"],"pdf_url":"https://arxiv.org/pdf/2501.05031v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05014v1","updated":"2025-01-09T07:15:59Z","published":"2025-01-09T07:15:59Z","title":"UAV-VLA: Vision-Language-Action System for Large Scale Aerial Mission\n Generation","summary":" The UAV-VLA (Visual-Language-Action) system is a tool designed to facilitate\ncommunication with aerial robots. By integrating satellite imagery processing\nwith the Visual Language Model (VLM) and the powerful capabilities of GPT,\nUAV-VLA enables users to generate general flight paths-and-action plans through\nsimple text requests. This system leverages the rich contextual information\nprovided by satellite images, allowing for enhanced decision-making and mission\nplanning. The combination of visual analysis by VLM and natural language\nprocessing by GPT can provide the user with the path-and-action set, making\naerial operations more efficient and accessible. 
The newly developed method\nshowed a difference of 22% in the length of the created trajectory and a mean\nerror of 34.22 m in finding the objects of interest on a map, measured by\nEuclidean distance in the K-Nearest Neighbors (KNN) approach.\n","authors":["Oleg Sautenkov","Yasheerah Yaqoot","Artem Lykov","Muhammad Ahsan Mustafa","Grik Tadevosyan","Aibek Akhmetkazy","Miguel Altamirano Cabrera","Mikhail Martynov","Sausar Karaf","Dzmitry Tsetserukou"],"pdf_url":"https://arxiv.org/pdf/2501.05014v1.pdf","comment":"HRI 2025"},{"id":"http://arxiv.org/abs/2409.14737v3","updated":"2025-01-09T07:09:44Z","published":"2024-09-23T06:33:52Z","title":"Generalizable Autonomous Driving System across Diverse Adverse Weather\n Conditions","summary":" Various adverse weather conditions pose a significant challenge to autonomous\ndriving (AD) street scene semantic understanding (segmentation). A common\nstrategy is to minimize the disparity between images captured in clear and\nadverse weather conditions. However, this technique typically relies on\nutilizing a clear image as a reference, which is challenging to obtain in\npractice. Furthermore, this method typically targets a single adverse\ncondition, and thus performs poorly when confronting a mixture of multiple\nadverse weather conditions. To address these issues, we introduce a\nreference-free and Adverse weather-Immune scheme (called AdvImmu) that\nleverages the invariance of weather conditions over short periods (seconds).\nSpecifically, AdvImmu includes three components: Locally Sequential Mechanism\n(LSM), Globally Shuffled Mechanism (GSM), and Unfolded Regularizers (URs). LSM\nleverages temporal correlations between adjacent frames to enhance model\nperformance. GSM is proposed to shuffle LSM segments to prevent overfitting of\ntemporal patterns. URs are the deep unfolding implementation of two proposed\nregularizers to penalize the model complexity to enhance across-weather\ngeneralization. 
In addition, to overcome the over-reliance on consecutive\nframe-wise annotations in the training of AdvImmu (typically unavailable in AD\nscenarios), we incorporate a foundation model named Segment Anything Model\n(SAM) to assist in annotating frames, and additionally propose a clustering\nalgorithm (denoted as SBICAC) to surmount SAM's category-agnostic issue to\ngenerate pseudo-labels. Extensive experiments demonstrate that the proposed\nAdvImmu outperforms existing state-of-the-art methods by 88.56% in mean\nIntersection over Union (mIoU).\n","authors":["Wei-Bin Kou","Guangxu Zhu","Rongguang Ye","Qingfeng Lin","Zeyi Ren","Ming Tang","Yik-Chung Wu"],"pdf_url":"https://arxiv.org/pdf/2409.14737v3.pdf","comment":"16 Pages"},{"id":"http://arxiv.org/abs/2501.05004v1","updated":"2025-01-09T06:56:44Z","published":"2025-01-09T06:56:44Z","title":"A Fast Path-Planning Method for Continuous Harvesting of Table-Top Grown\n Strawberries","summary":" Continuous harvesting and storage of multiple fruits in a single operation\nallow robots to significantly reduce the travel distance required for\nrepetitive back-and-forth movements. Traditional collision-free path planning\nalgorithms, such as Rapidly-Exploring Random Tree (RRT) and A-star (A*), often\nfail to meet the demands of efficient continuous fruit harvesting due to their\nlow search efficiency and the generation of excessive redundant points. This\npaper presents the Interactive Local Minima Search Algorithm (ILMSA), a fast\npath-planning method designed for the continuous harvesting of table-top grown\nstrawberries. The algorithm featured an interactive node expansion strategy\nthat iteratively extended and refined collision-free path segments based on\nlocal minima points. To enable the algorithm to function in 3D, the 3D\nenvironment was projected onto multiple 2D planes, generating optimal paths on\neach plane. The best path was then selected, followed by integrating and\nsmoothing the 3D path segments. 
Simulations demonstrated that ILMSA\noutperformed existing methods, reducing path length by 21.5% and planning time\nby 97.1% compared to 3D-RRT, while achieving 11.6% shorter paths and 25.4%\nfewer nodes than the Lowest Point of the Strawberry (LPS) algorithm in 3D\nenvironments. In 2D, ILMSA achieved path lengths 16.2% shorter than A*, 23.4%\nshorter than RRT, and 20.9% shorter than RRT-Connect, while being over 96%\nfaster and generating significantly fewer nodes. Field tests confirmed ILMSA's\nsuitability for complex agricultural tasks, having a combined planning and\nexecution time and an average path length that were approximately 58% and 69%,\nrespectively, of those achieved by the LPS algorithm.\n","authors":["Zhonghua Miao","Yang Chen","Lichao Yang","Shimin Hu","Ya Xiong"],"pdf_url":"https://arxiv.org/pdf/2501.05004v1.pdf","comment":"Accepted by IEEE Transactions on AgriFood Electronics"},{"id":"http://arxiv.org/abs/2410.14368v2","updated":"2025-01-09T06:02:11Z","published":"2024-10-18T10:53:44Z","title":"CoMAL: Collaborative Multi-Agent Large Language Models for\n Mixed-Autonomy Traffic","summary":" The integration of autonomous vehicles into urban traffic has great potential\nto improve efficiency by reducing congestion and optimizing traffic flow\nsystematically. In this paper, we introduce CoMAL (Collaborative Multi-Agent\nLLMs), a framework designed to address the mixed-autonomy traffic problem by\ncollaboration among autonomous vehicles to optimize traffic flow. CoMAL is\nbuilt upon large language models, operating in an interactive traffic\nsimulation environment. It utilizes a Perception Module to observe surrounding\nagents and a Memory Module to store strategies for each agent. 
The overall\nworkflow includes a Collaboration Module that encourages autonomous vehicles to\ndiscuss the effective strategy and allocate roles, a reasoning engine to\ndetermine optimal behaviors based on assigned roles, and an Execution Module\nthat controls vehicle actions using a hybrid approach combining rule-based\nmodels. Experimental results demonstrate that CoMAL achieves superior\nperformance on the Flow benchmark. Additionally, we evaluate the impact of\ndifferent language models and compare our framework with reinforcement learning\napproaches. It highlights the strong cooperative capability of LLM agents and\npresents a promising solution to the mixed-autonomy traffic challenge. The code\nis available at https://github.com/Hyan-Yao/CoMAL.\n","authors":["Huaiyuan Yao","Longchao Da","Vishnu Nandam","Justin Turnau","Zhiwei Liu","Linsey Pang","Hua Wei"],"pdf_url":"https://arxiv.org/pdf/2410.14368v2.pdf","comment":"8 pages, 4 figures, accepted to SDM25"},{"id":"http://arxiv.org/abs/2501.04988v1","updated":"2025-01-09T06:01:34Z","published":"2025-01-09T06:01:34Z","title":"Intelligent Sailing Model for Open Sea Navigation","summary":" Autonomous vessels potentially enhance safety and reliability of seaborne\ntrade. To facilitate the development of autonomous vessels, high-fidelity\nsimulations are required to model realistic interactions with other vessels.\nHowever, modeling realistic interactive maritime traffic is challenging due to\nthe unstructured environment, coarsely specified traffic rules, and largely\nvarying vessel types. Currently, there is no standard for simulating\ninteractive maritime environments in order to rigorously benchmark autonomous\nvessel algorithms. In this paper, we introduce the first intelligent sailing\nmodel (ISM), which simulates rule-compliant vessels for navigation on the open\nsea. 
An ISM vessel reacts to other traffic participants according to maritime\ntraffic rules while at the same time solving a motion planning task\ncharacterized by waypoints. In particular, the ISM monitors the applicable\nrules, generates rule-compliant waypoints accordingly, and utilizes a model\npredictive control for tracking the waypoints. We evaluate the ISM in two\nenvironments: interactive traffic with only ISM vessels and mixed traffic where\nsome vessel trajectories are from recorded real-world maritime traffic data or\nhandcrafted for criticality. Our results show that simulations with many ISM\nvessels of different vessel types are rule-compliant and scalable. We tested\n4,049 critical traffic scenarios. For interactive traffic with ISM vessels, no\ncollisions occurred while goal-reaching rates of about 97 percent were\nachieved. We believe that our ISM can serve as a standard for challenging and\nrealistic maritime traffic simulation to accelerate autonomous vessel\ndevelopment.\n","authors":["Hanna Krasowski","Stefan Schärdinger","Murat Arcak","Matthias Althoff"],"pdf_url":"https://arxiv.org/pdf/2501.04988v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04982v1","updated":"2025-01-09T05:45:03Z","published":"2025-01-09T05:45:03Z","title":"CuRLA: Curriculum Learning Based Deep Reinforcement Learning for\n Autonomous Driving","summary":" In autonomous driving, traditional Computer Vision (CV) agents often struggle\nin unfamiliar situations due to biases in the training data. Deep Reinforcement\nLearning (DRL) agents address this by learning from experience and maximizing\nrewards, which helps them adapt to dynamic environments. However, ensuring\ntheir generalization remains challenging, especially with static training\nenvironments. Additionally, DRL models lack transparency, making it difficult\nto guarantee safety in all scenarios, particularly those not seen during\ntraining. 
To tackle these issues, we propose a method that combines DRL with\nCurriculum Learning for autonomous driving. Our approach uses a Proximal Policy\nOptimization (PPO) agent and a Variational Autoencoder (VAE) to learn safe\ndriving in the CARLA simulator. The agent is trained using two-fold curriculum\nlearning, progressively increasing environment difficulty and incorporating a\ncollision penalty in the reward function to promote safety. This method\nimproves the agent's adaptability and reliability in complex environments, and\nhelps it understand the nuances of balancing multiple reward components from\ndifferent feedback signals in a single scalar reward function. Keywords:\nComputer Vision, Deep Reinforcement Learning, Variational Autoencoder, Proximal\nPolicy Optimization, Curriculum Learning, Autonomous Driving.\n","authors":["Bhargava Uppuluri","Anjel Patel","Neil Mehta","Sridhar Kamath","Pratyush Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2501.04982v1.pdf","comment":"To be published in the 17th International Conference on Agents and\n Artificial Intelligence (ICAART), Feb 2025"},{"id":"http://arxiv.org/abs/2501.04969v1","updated":"2025-01-09T04:47:51Z","published":"2025-01-09T04:47:51Z","title":"AD-L-JEPA: Self-Supervised Spatial World Models with Joint Embedding\n Predictive Architecture for Autonomous Driving with LiDAR Data","summary":" As opposed to human drivers, current autonomous driving systems still require\nvast amounts of labeled data to train. Recently, world models have been\nproposed to simultaneously enhance autonomous driving capabilities by improving\nthe way these systems understand complex real-world environments and reduce\ntheir data demands via self-supervised pre-training. 
In this paper, we present\nAD-L-JEPA (aka Autonomous Driving with LiDAR data via a Joint Embedding\nPredictive Architecture), a novel self-supervised pre-training framework for\nautonomous driving with LiDAR data that, as opposed to existing methods, is\nneither generative nor contrastive. Our method learns spatial world models with\na joint embedding predictive architecture. Instead of explicitly generating\nmasked unknown regions, our self-supervised world models predict Bird's Eye\nView (BEV) embeddings to represent the diverse nature of autonomous driving\nscenes. Our approach furthermore eliminates the need to manually create\npositive and negative pairs, as is the case in contrastive learning. AD-L-JEPA\nleads to simpler implementation and enhanced learned representations. We\nqualitatively and quantitatively demonstrate the high quality of embeddings\nlearned with AD-L-JEPA. We furthermore evaluate the accuracy and label\nefficiency of AD-L-JEPA on popular downstream tasks such as LiDAR 3D object\ndetection and associated transfer learning. Our experimental evaluation\ndemonstrates that AD-L-JEPA is a plausible approach for self-supervised\npre-training in autonomous driving applications and is the best available\napproach, outperforming SOTA, including the most recently proposed\nOccupancy-MAE [1] and ALSO [2]. The source code of AD-L-JEPA is available at\nhttps://github.com/HaoranZhuExplorer/AD-L-JEPA-Release.\n","authors":["Haoran Zhu","Zhenyuan Dong","Kristi Topollai","Anna Choromanska"],"pdf_url":"https://arxiv.org/pdf/2501.04969v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04276v2","updated":"2025-01-09T04:26:27Z","published":"2025-01-08T04:54:28Z","title":"Bridging Adaptivity and Safety: Learning Agile Collision-Free Locomotion\n Across Varied Physics","summary":" Real-world legged locomotion systems often need to reconcile agility and\nsafety for different scenarios. 
Moreover, the underlying dynamics are often\nunknown and time-variant (e.g., payload, friction). In this paper, we introduce\nBAS (Bridging Adaptivity and Safety), which builds upon the pipeline of the\nprior work Agile But Safe (ABS) (He et al.) and is designed to provide adaptive\nsafety even in dynamic environments with uncertainties. BAS involves an agile\npolicy to avoid obstacles rapidly and a recovery policy to prevent collisions,\na physical parameter estimator that is concurrently trained with the agile\npolicy, and a learned control-theoretic RA (reach-avoid) value network that\ngoverns the policy switch. Also, the agile policy and RA network are both\nconditioned on physical parameters to make them adaptive. To mitigate the\ndistribution shift issue, we further introduce an on-policy fine-tuning phase\nfor the estimator to enhance its robustness and accuracy. The simulation\nresults show that BAS achieves 50% better safety than baselines in dynamic\nenvironments while maintaining a higher speed on average. In real-world\nexperiments, BAS shows its capability in complex environments with unknown\nphysics (e.g., slippery floors with unknown friction, unknown payloads up to\n8kg), while baselines lack adaptivity, leading to collisions or degraded\nagility. As a result, BAS achieves a 19.8% increase in speed and a 2.36 times\nlower collision rate than ABS in the real world. 
Videos: https://adaptive-safe-locomotion.github.io.\n","authors":["Yichao Zhong","Chong Zhang","Tairan He","Guanya Shi"],"pdf_url":"https://arxiv.org/pdf/2501.04276v2.pdf","comment":"11 Pages, 6 Figures"},{"id":"http://arxiv.org/abs/2311.09346v2","updated":"2025-01-09T04:20:34Z","published":"2023-11-15T20:09:29Z","title":"Nothing Stands Still: A Spatiotemporal Benchmark on 3D Point Cloud\n Registration Under Large Geometric and Temporal Change","summary":" Building 3D geometric maps of man-made spaces is a well-established and\nactive field that is fundamental to computer vision and robotics. However,\nconsidering the evolving nature of built environments, it is essential to\nquestion the capabilities of current mapping efforts in handling temporal\nchanges. In addition, spatiotemporal mapping holds significant potential for\nachieving sustainability and circularity goals. Existing mapping approaches\nfocus on small changes, such as object relocation or self-driving car\noperation; in all cases where the main structure of the scene remains fixed.\nConsequently, these approaches fail to address more radical changes in the\nstructure of the built environment, such as geometry and topology. To this end,\nwe introduce the Nothing Stands Still (NSS) benchmark, which focuses on the\nspatiotemporal registration of 3D scenes undergoing large spatial and temporal\nchange, ultimately creating one coherent spatiotemporal map. Specifically, the\nbenchmark involves registering two or more partial 3D point clouds (fragments)\nfrom the same scene but captured from different spatiotemporal views. In\naddition to the standard pairwise registration, we assess the multi-way\nregistration of multiple fragments that belong to any temporal stage. As part\nof NSS, we introduce a dataset of 3D point clouds recurrently captured in\nlarge-scale building indoor environments that are under construction or\nrenovation. 
The NSS benchmark presents three scenarios of increasing\ndifficulty, to quantify the generalization ability of point cloud registration\nmethods over space (within one building and across buildings) and time. We\nconduct extensive evaluations of state-of-the-art methods on NSS. The results\ndemonstrate the necessity for novel methods specifically designed to handle\nlarge spatiotemporal changes. The homepage of our benchmark is at\nhttp://nothing-stands-still.com.\n","authors":["Tao Sun","Yan Hao","Shengyu Huang","Silvio Savarese","Konrad Schindler","Marc Pollefeys","Iro Armeni"],"pdf_url":"https://arxiv.org/pdf/2311.09346v2.pdf","comment":"To appear in the ISPRS Journal of Photogrammetry and Remote Sensing.\n 29 pages, 26 figures. For the project page, see\n http://nothing-stands-still.com"},{"id":"http://arxiv.org/abs/2501.04595v2","updated":"2025-01-09T04:13:45Z","published":"2025-01-08T16:23:56Z","title":"MobileH2R: Learning Generalizable Human to Mobile Robot Handover\n Exclusively from Scalable and Diverse Synthetic Data","summary":" This paper introduces MobileH2R, a framework for learning generalizable\nvision-based human-to-mobile-robot (H2MR) handover skills. Unlike traditional\nfixed-base handovers, this task requires a mobile robot to reliably receive\nobjects in a large workspace enabled by its mobility. Our key insight is that\ngeneralizable handover skills can be developed in simulators using high-quality\nsynthetic data, without the need for real-world demonstrations. To achieve\nthis, we propose a scalable pipeline for generating diverse synthetic full-body\nhuman motion data, an automated method for creating safe and imitation-friendly\ndemonstrations, and an efficient 4D imitation learning method for distilling\nlarge-scale demonstrations into closed-loop policies with base-arm\ncoordination. 
Experimental evaluations in both simulators and the real world\nshow significant improvements (at least +15% success rate) over baseline\nmethods in all cases. Experiments also validate that large-scale and diverse\nsynthetic data greatly enhances robot learning, highlighting our scalable\nframework.\n","authors":["Zifan Wang","Ziqing Chen","Junyu Chen","Jilong Wang","Yuxin Yang","Yunze Liu","Xueyi Liu","He Wang","Li Yi"],"pdf_url":"https://arxiv.org/pdf/2501.04595v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04929v1","updated":"2025-01-09T02:45:05Z","published":"2025-01-09T02:45:05Z","title":"What Drives You to Interact?: The Role of User Motivation for a Robot in\n the Wild","summary":" In this paper, we aim to understand how user motivation shapes human-robot\ninteraction (HRI) in the wild. To explore this, we conducted a field study by\ndeploying a fully autonomous conversational robot in a shopping mall over two\ndays. Through sequential video analysis, we identified five patterns of\ninteraction fluency (Smooth, Awkward, Active, Messy, and Quiet), four types of\nuser motivation for interacting with the robot (Function, Experiment,\nCuriosity, and Education), and user positioning towards the robot. We further\nanalyzed how these motivations and positioning influence interaction fluency.\nOur findings suggest that incorporating users' motivation types into the design\nof robot behavior can enhance interaction fluency, engagement, and user\nsatisfaction in real-world HRI scenarios.\n","authors":["Amy Koike","Yuki Okafuji","Kenya Hoshimure","Jun Baba"],"pdf_url":"https://arxiv.org/pdf/2501.04929v1.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2501.04228v2","updated":"2025-01-09T01:35:56Z","published":"2025-01-08T01:59:47Z","title":"Constraints as Rewards: Reinforcement Learning for Robots without Reward\n Functions","summary":" Reinforcement learning has become an essential algorithm for generating\ncomplex robotic behaviors. 
However, to learn such behaviors, it is necessary to\ndesign a reward function that describes the task, which often consists of\nmultiple objectives that need to be balanced. This tuning process is known as\nreward engineering and typically involves extensive trial-and-error. In this\npaper, to avoid this trial-and-error process, we propose the concept of\nConstraints as Rewards (CaR). CaR formulates the task objective using multiple\nconstraint functions instead of a reward function and solves a reinforcement\nlearning problem with constraints using the Lagrangian method. By adopting this\napproach, different objectives are automatically balanced, because the Lagrange\nmultipliers serve as the weights among the objectives. In addition, we\ndemonstrate that constraints, expressed as inequalities, provide an intuitive\ninterpretation of the optimization target designed for the task. We apply the\nproposed method to the standing-up motion generation task of a\nsix-wheeled-telescopic-legged robot and demonstrate that the proposed method\nsuccessfully acquires the target behavior, even though it is challenging to\nlearn with manually designed reward functions.\n","authors":["Yu Ishihara","Noriaki Takasugi","Kotaro Kawakami","Masaya Kinoshita","Kazumi Aoyama"],"pdf_url":"https://arxiv.org/pdf/2501.04228v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05610v1","updated":"2025-01-09T23:18:38Z","published":"2025-01-09T23:18:38Z","title":"Towards Probabilistic Inference of Human Motor Intentions by Assistive\n Mobile Robots Controlled via a Brain-Computer Interface","summary":" Assistive mobile robots are a transformative technology that helps persons\nwith disabilities regain the ability to move freely. Although autonomous\nwheelchairs significantly reduce user effort, they still require human input to\nallow users to maintain control and adapt to changing environments. 
The Brain\nComputer Interface (BCI) stands out as a highly user-friendly option that does\nnot require physical movement. Current BCI systems can understand whether users\nwant to accelerate or decelerate, but they implement these changes in discrete\nspeed steps rather than allowing for smooth, continuous velocity adjustments.\nThis limitation prevents the systems from mimicking the natural, fluid speed\nchanges seen in human self-paced motion. The authors aim to address this\nlimitation by redesigning the perception-action cycle in a BCI-controlled\nrobotic system: improving how the robotic agent interprets the user's motion\nintentions (world state) and implementing these actions in a way that better\nreflects the natural physical properties of motion, such as inertia and\ndamping. The scope of this paper focuses on the perception aspect. We asked and\nanswered a normative question \"what computation should the robotic agent carry\nout to optimally perceive incomplete or noisy sensory observations?\" Empirical\nEEG data were collected, and probabilistic representations that served as\nworld-state distributions were learned and evaluated in a Generative\nAdversarial Network framework. A ROS framework was established that connected\nto a Gazebo environment containing a digital twin of an indoor space and a\nvirtual model of a robotic wheelchair. Signal processing and statistical\nanalyses were implemented to identify the most discriminative features in the\nspatial-spectral-temporal dimensions, which were then used to construct the\nworld model for the robotic agent to interpret user motion intentions as a\nBayesian observer.\n","authors":["Xiaoshan Zhou","Carol M. Menassa","Vineet R. 
Kamat"],"pdf_url":"https://arxiv.org/pdf/2501.05610v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2304.02075v2","updated":"2025-01-09T19:54:53Z","published":"2023-04-04T18:58:16Z","title":"GUTS: Generalized Uncertainty-Aware Thompson Sampling for Multi-Agent\n Active Search","summary":" Robotic solutions for quick disaster response are essential to ensure minimal\nloss of life, especially when the search area is too dangerous or too vast for\nhuman rescuers. We model this problem as an asynchronous multi-agent\nactive-search task where each robot aims to efficiently seek objects of\ninterest (OOIs) in an unknown environment. This formulation addresses the\nrequirement that search missions should focus on quick recovery of OOIs rather\nthan full coverage of the search region. Previous approaches fail to accurately\nmodel sensing uncertainty, account for occlusions due to foliage or terrain, or\nconsider the requirement for heterogeneous search teams and robustness to\nhardware and communication failures. We present the Generalized\nUncertainty-aware Thompson Sampling (GUTS) algorithm, which addresses these\nissues and is suitable for deployment on heterogeneous multi-robot systems for\nactive search in large unstructured environments. We show through simulation\nexperiments that GUTS consistently outperforms existing methods such as\nparallelized Thompson Sampling and exhaustive search, recovering all OOIs in\n80% of all runs. In contrast, existing approaches recover all OOIs in less than\n40% of all runs. We conduct field tests using our multi-robot system in an\nunstructured environment with a search area of approximately 75,000 sq. m. 
Our\nsystem demonstrates robustness to various failure modes, achieving full\nrecovery of OOIs (where feasible) in every field run, and significantly\noutperforming our baseline.\n","authors":["Nikhil Angad Bakshi","Tejus Gupta","Ramina Ghods","Jeff Schneider"],"pdf_url":"https://arxiv.org/pdf/2304.02075v2.pdf","comment":"7 pages, 5 figures, 1 table, for associated video see:\n https://youtu.be/K0jkzdQ_j2E , published in International Conference on\n Robotics and Automation (ICRA) 2023. Outstanding Deployed Systems Paper\n Winner"},{"id":"http://arxiv.org/abs/2501.06263v1","updated":"2025-01-09T15:00:03Z","published":"2025-01-09T15:00:03Z","title":"GelBelt: A Vision-based Tactile Sensor for Continuous Sensing of Large\n Surfaces","summary":" Scanning large-scale surfaces is widely demanded in surface reconstruction\napplications and detecting defects in industries' quality control and\nmaintenance stages. Traditional vision-based tactile sensors have shown\npromising performance in high-resolution shape reconstruction while suffering\nlimitations such as small sensing areas or susceptibility to damage when slid\nacross surfaces, making them unsuitable for continuous sensing on large\nsurfaces. To address these shortcomings, we introduce a novel vision-based\ntactile sensor designed for continuous surface sensing applications. Our design\nuses an elastomeric belt and two wheels to continuously scan the target\nsurface. The proposed sensor showed promising results in both shape\nreconstruction and surface fusion, indicating its applicability. The dot\nproduct of the estimated and reference surface normal map is reported over the\nsensing area and for different scanning speeds. Results indicate that the\nproposed sensor can rapidly scan large-scale surfaces with high accuracy at\nspeeds up to 45 mm/s.\n","authors":["Mohammad Amin Mirzaee","Hung-Jui Huang","Wenzhen Yuan"],"pdf_url":"https://arxiv.org/pdf/2501.06263v1.pdf","comment":"Accepted to IEEE RA-L. 
8 pages, 7 figures, webpage:\n https://aminmirz.github.io/GelBelt/"},{"id":"http://arxiv.org/abs/2501.06262v1","updated":"2025-01-09T13:27:02Z","published":"2025-01-09T13:27:02Z","title":"Towards smart and adaptive agents for active sensing on edge devices","summary":" TinyML has made deploying deep learning models on low-power edge devices\nfeasible, creating new opportunities for real-time perception in constrained\nenvironments. However, the adaptability of such deep learning methods remains\nlimited to data drift adaptation, lacking broader capabilities that account for\nthe environment's underlying dynamics and inherent uncertainty. Deep learning's\nscaling laws, which counterbalance this limitation by massively up-scaling data\nand model size, cannot be applied when deploying on the Edge, where deep\nlearning limitations are further amplified as models are scaled down for\ndeployment on resource-constrained devices.\n This paper presents a smart agentic system capable of performing on-device\nperception and planning, enabling active sensing on the edge. By incorporating\nactive inference into our solution, our approach extends beyond deep learning\ncapabilities, allowing the system to plan in dynamic environments while\noperating in real time with a modest total model size of 2.3 MB. 
We showcase\nour proposed system by creating and deploying a saccade agent connected to an\nIoT camera with pan and tilt capabilities on an NVIDIA Jetson embedded device.\nThe saccade agent controls the camera's field of view following optimal\npolicies derived from the active inference principles, simulating human-like\nsaccadic motion for surveillance and robotics applications.\n","authors":["Devendra Vyas","Miguel de Prado","Tim Verbelen"],"pdf_url":"https://arxiv.org/pdf/2501.06262v1.pdf","comment":null}],"Systems and Control":[{"id":"http://arxiv.org/abs/2409.05545v2","updated":"2025-01-09T18:51:52Z","published":"2024-09-09T12:11:18Z","title":"Adaptive Probabilistic Planning for the Uncertain and Dynamic\n Orienteering Problem","summary":" The Orienteering Problem (OP) is a well-studied routing problem that has been\nextended to incorporate uncertainties, reflecting stochastic or dynamic travel\ncosts, prize-collection costs, and prizes. Existing approaches may, however, be\ninefficient in real-world applications due to insufficient modeling knowledge\nand initially unknowable parameters in online scenarios. Thus, we propose the\nUncertain and Dynamic Orienteering Problem (UDOP), modeling travel costs as\ndistributions with unknown and time-variant parameters. UDOP also associates\nuncertain travel costs with dynamic prizes and prize-collection costs for its\nobjective and budget constraints. To address UDOP, we develop an ADaptive\nApproach for Probabilistic paThs - ADAPT, that iteratively performs 'execution'\nand 'online planning' based on an initial 'offline' solution. The execution\nphase updates system status and records online cost observations. The online\nplanner employs a Bayesian approach to adaptively estimate power consumption\nand optimize path sequence based on safety beliefs. We evaluate ADAPT in a\npractical Unmanned Aerial Vehicle (UAV) charging scheduling problem for\nWireless Rechargeable Sensor Networks. 
The UAV must optimize its path to\nrecharge sensor nodes efficiently while managing its energy under uncertain\nconditions. ADAPT maintains comparable solution quality and computation time\nwhile offering superior robustness. Extensive simulations show that ADAPT\nachieves a 100% Mission Success Rate (MSR) across all tested scenarios,\noutperforming comparable heuristic-based and frequentist approaches that fail\nup to 70% (under challenging conditions) and averaging 67% MSR, respectively.\nThis work advances the field of OP with uncertainties, offering a reliable and\nefficient approach for real-world applications in uncertain and dynamic\nenvironments.\n","authors":["Qiuchen Qian","Yanran Wang","David Boyle"],"pdf_url":"https://arxiv.org/pdf/2409.05545v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.06445v4","updated":"2025-01-09T18:17:20Z","published":"2024-05-10T12:50:52Z","title":"Systematic interval observer design for linear systems","summary":" We first propose systematic and comprehensive interval observer designs for\nlinear time-invariant systems, under standard assumptions involving\nobservability and interval bounds on the initial condition and disturbances.\nHistorically, such designs rely on transformations with certain limitations\ninto a form that is Metzler (for continuous time) or non-negative (for discrete\ntime). We show that they can be effectively replaced with a linear\ntime-invariant transformation that can be easily computed offline. Next, we\npropose an extension to the time-varying setting, addressing the limitations of\nconventional transformations that lack guaranteed outcomes. We employ dynamical\ntransformations into higher-dimensional target forms for which an interval\nobserver can always be constructed. 
These transformations become\nleft-invertible after a certain time, provided observability conditions are met\nand the target dynamics are sufficiently high-dimensional and fast, thus\nenabling the reconstruction of bounds in the original coordinates in finite\ntime. Academic examples are presented to illustrate our methods.\n","authors":["Thach Ngoc Dinh","Gia Quoc Bao Tran"],"pdf_url":"https://arxiv.org/pdf/2405.06445v4.pdf","comment":"7 pages, 3 figures"},{"id":"http://arxiv.org/abs/2501.05285v1","updated":"2025-01-09T14:49:03Z","published":"2025-01-09T14:49:03Z","title":"Pitch Plane Trajectory Tracking Control for Sounding Rockets via\n Adaptive Feedback Linearization","summary":" This paper proposes a pitch plane trajectory tracking control solution for\nsuborbital launch vehicles relying on adaptive feedback linearization.\nInitially, the 2D dynamics and kinematics for a single-engine,\nthrust-vector-controlled sounding rocket are obtained for control design\npurposes. Then, an inner-outer control strategy, which simultaneously tackles\nattitude and position control, is adopted, with the inner-loop comprising the\naltitude and pitch control and the outer-loop addressing the horizontal\n(downrange) position control. Feedback linearization is used to cancel out the\nnon-linearities in both the inner and outer dynamics. Making use of Lyapunov\nstability theory, an adaptation law, which provides online estimates on the\ninner-loop aerodynamic uncertainty, is jointly designed with the output\ntracking controller via adaptive backstepping, ensuring global reference\ntracking in the region where the feedback linearization is well-defined. The\nzero dynamics of the inner-stabilized system are then exploited to obtain the\nouter-loop dynamics and derive a Linear Quadratic Regulator (LQR) with integral\naction, which can stabilize them as well as reject external disturbances. 
In\nthe outermost loop, the estimate on the correspondent aerodynamic uncertainty\nis indirectly obtained by using the inner loop estimates together with known\naerodynamics relations. The resulting inner-outer position control solution is\nproven to be asymptotically stable in the region of interest. Using a\nsingle-stage sounding rocket, propelled by a liquid engine, as reference\nvehicle, different mission scenarios are tested in a simulation environment to\nverify the adaptability of the proposed control strategy. The system is able to\ntrack the requested trajectories while rejecting external wind disturbances.\nFurthermore, the need to re-tune the control gains in between different mission\nscenarios is minimal to none.\n","authors":["Pedro dos Santos","Paulo Oliveira"],"pdf_url":"https://arxiv.org/pdf/2501.05285v1.pdf","comment":"Paper accepted to the IEEE Aerospace Conference 2025. Copyright:\n 979-8-3503-5597-0/25/$31.00 @2025 IEEE"},{"id":"http://arxiv.org/abs/2501.04572v2","updated":"2025-01-09T14:30:41Z","published":"2025-01-08T15:42:41Z","title":"Regret Analysis: a control perspective","summary":" Online learning and model reference adaptive control have many interesting\nintersections. One area where they differ however is in how the algorithms are\nanalyzed and what objective or metric is used to discriminate \"good\" algorithms\nfrom \"bad\" algorithms. In adaptive control there are usually two objectives: 1)\nprove that all time varying parameters/states of the system are bounded, and 2)\nthat the instantaneous error between the adaptively controlled system and a\nreference system converges to zero over time (or at least a compact set). For\nonline learning the performance of algorithms is often characterized by the\nregret the algorithm incurs. Regret is defined as the cumulative loss (cost)\nover time from the online algorithm minus the cumulative loss (cost) of the\nsingle optimal fixed parameter choice in hindsight. 
Another significant\ndifference between the two areas of research is with regard to the assumptions\nmade in order to obtain said results. Adaptive control makes assumptions about\nthe input-output properties of the control problem and derives solutions for a\nfixed error model or optimization task. In the online learning literature\nresults are derived for classes of loss functions (i.e. convex) while a priori\nassuming that all time varying parameters are bounded, which for many\noptimization tasks is not unrealistic, but is a non starter in control\napplications. In this work we discuss these differences in detail through the\nregret based analysis of gradient descent for convex functions and the control\nbased analysis of a streaming regression problem. We close with a discussion\nabout the newly defined paradigm of online adaptive control and ask the\nfollowing question \"Are regret optimal control strategies deployable?\"\n","authors":["Travis E. Gibson","Sawal Acharya"],"pdf_url":"https://arxiv.org/pdf/2501.04572v2.pdf","comment":"10 pages no figures"},{"id":"http://arxiv.org/abs/2501.05163v1","updated":"2025-01-09T11:36:29Z","published":"2025-01-09T11:36:29Z","title":"Explainable AI based System for Supply Air Temperature Forecast","summary":" This paper explores the application of Explainable AI (XAI) techniques to\nimprove the transparency and understanding of predictive models in control of\nautomated supply air temperature (ASAT) of Air Handling Unit (AHU). The study\nfocuses on forecasting of ASAT using a linear regression with Huber loss.\nHowever, having only a control curve without semantic and/or physical\nexplanation is often not enough. The present study employs one of the XAI\nmethods: Shapley values, which allows to reveal the reasoning and highlight the\ncontribution of each feature to the final ASAT forecast. In comparison to other\nXAI methods, Shapley values have solid mathematical background, resulting in\ninterpretation transparency. 
The study demonstrates the contrastive\nexplanations--slices, for each control value of ASAT, which makes it possible\nto give the client objective justifications for curve changes.\n","authors":["Marika Eik","Ahmet Kose","Hossein Nourollahi Hokmabad","Juri Belikov"],"pdf_url":"https://arxiv.org/pdf/2501.05163v1.pdf","comment":"5 pages, 7 figures, 1 table, conference paper"},{"id":"http://arxiv.org/abs/2405.16490v2","updated":"2025-01-09T10:38:46Z","published":"2024-05-26T08:58:03Z","title":"Formalising the intentional stance 1: attributing goals and beliefs to\n stochastic processes","summary":" This article presents a formalism inspired by Dennett's notion of the\nintentional stance. Whereas Dennett's treatment of these concepts is informal,\nwe aim to provide a more formal analogue. We introduce a framework based on\nstochastic processes with inputs and outputs, in which we can talk precisely\nabout *interpreting* systems as having *normative-epistemic states*, which\ncombine belief-like and desire-like features. Our framework is based on\noptimality but nevertheless allows us to model some forms of bounded cognition.\n One might expect that the systems that can be described in\nnormative-epistemic terms would be some special subset of all systems, but we\nshow that this is not the case: every system admits a (possibly trivial)\nnormative-epistemic interpretation, and those that can be *uniquely specified*\nby a normative-epistemic description are exactly the deterministic ones.\nFinally, we show that there is a suitable notion of Bayesian updating for\nnormative-epistemic states, which we call *value-laden filtering*, since it\ninvolves both normative and epistemic elements. For unbounded cognition it is\nalways permissible to attribute beliefs that update in this way. 
This is not\nalways the case for bounded cognition, but we give a sufficient condition under\nwhich it is.\n This paper gives an overview of our framework aimed at cognitive scientists,\nwith a formal mathematical treatment given in a companion paper.\n","authors":["Simon McGregor"," timorl","Nathaniel Virgo"],"pdf_url":"https://arxiv.org/pdf/2405.16490v2.pdf","comment":"The previous version of this document included the content of the\n companion paper, \"Formalising the intentional stance 2: a coinductive\n approach\". The paper has now been split into two, this one (which is an\n overview aimed at cognitive scientists) and the companion (which contains\n full mathematical detail). 16 pages, one figure with two subfigures"},{"id":"http://arxiv.org/abs/2501.05102v1","updated":"2025-01-09T09:44:25Z","published":"2025-01-09T09:44:25Z","title":"Coordinated Control of Deformation and Flight for Morphing Aircraft via\n Meta-Learning and Coupled State-Dependent Riccati Equations","summary":" In this paper, the coordinated control problem of deformation and flight for\nmorphing aircraft (MA) is studied by using meta-learning (ML) and coupled\nstate-dependent Riccati equations (CSDREs). Our method is built on two\nprincipal observations that dynamic models of MA under varying morphing\nconditions share a morphing condition independent representation function and\nthat the specific morphing condition part lies in a set of linear coefficients.\nTo that end, the domain adversarially invariant meta-learning (DAIML) is\nemployed to learn the shared representation with offline flight data. Based on\nthe learned representation function, the coordinated control of the deformation\nand flight for MA is formulated as a non-cooperative differential game. The\nstate-dependent feedback control solutions can be derived by addressing a pair\nof CSDREs. 
For this purpose, Lyapunov iterations are extended to obtain the\npositive semidefinite (definite) stabilizing solutions of the CSDREs, and the\nconvergence proof of the proposed algorithm is provided. Finally, a simulation\nstudy is carried out to validate the efficacy of the developed coordinated game\ncontrol strategies.\n","authors":["Hao-Chi Che","Huai-Ning Wu"],"pdf_url":"https://arxiv.org/pdf/2501.05102v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.03997v5","updated":"2025-01-09T09:41:38Z","published":"2024-01-08T16:20:05Z","title":"Low-Complexity Control for a Class of Uncertain MIMO Nonlinear Systems\n under Generalized Time-Varying Output Constraints (extended version)","summary":" This paper introduces a novel control framework to address the satisfaction\nof multiple time-varying output constraints in uncertain high-order MIMO\nnonlinear control systems. Unlike existing methods, which often assume that the\nconstraints are always decoupled and feasible, our approach can handle coupled\ntime-varying constraints even in the presence of potential infeasibilities.\nFirst, it is shown that satisfying multiple constraints essentially boils down\nto ensuring the positivity of a scalar variable, representing the signed\ndistance from the boundary of the time-varying output-constrained set. To\nachieve this, a single consolidating constraint is designed that, when\nsatisfied, guarantees convergence to and invariance of the time-varying\noutput-constrained set within a user-defined finite time. Next, a novel robust\nand low-complexity feedback controller is proposed to ensure the satisfaction\nof the consolidating constraint. 
Additionally, we provide a mechanism for\nonline modification of the consolidating constraint to find a least violating\nsolution when the constraints become mutually infeasible for some time.\nFinally, simulation examples of trajectory and region tracking for a mobile\nrobot validate the proposed approach.\n","authors":["Farhad Mehdifar","Lars Lindemann","Charalampos P. Bechlioulis","Dimos V. Dimarogonas"],"pdf_url":"https://arxiv.org/pdf/2401.03997v5.pdf","comment":"extended version, 21 pages, 8 figures"},{"id":"http://arxiv.org/abs/2405.19546v4","updated":"2025-01-09T05:01:32Z","published":"2024-05-29T22:19:39Z","title":"Convex Optimization of Initial Perturbations toward Quantitative Weather\n Control","summary":" This study proposes introducing convex optimization to find initial\nperturbations of atmospheric states to realize specified changes in subsequent\nweather. In the proposed method, we formulate and solve an inverse problem to\nfind effective perturbations in atmospheric variables so that controlled\nvariables satisfy specified changes at a specified time. The proposed method\nfirst constructs a sensitivity matrix of controlled variables, such as\naccumulated precipitation, to the initial atmospheric variables, such as\ntemperature and humidity, through sensitivity analysis using a numerical\nweather prediction (NWP) model. Then a convex optimization problem is\nformulated to achieve various control specifications involving not only\nquadratic functions but also absolute values and maximum values of the\ncontrolled variables and initial atmospheric variables in the cost function and\nconstraints. The proposed method was validated through a benchmark warm bubble\nexperiment using the NWP model. 
The experiments showed that the identified\nperturbations successfully realized specified spatial distributions of\naccumulated precipitation.\n","authors":["Toshiyuki Ohtsuka","Atsushi Okazaki","Masaki Ogura","Shunji Kotsuki"],"pdf_url":"https://arxiv.org/pdf/2405.19546v4.pdf","comment":"shortend to improve conciseness; some figures added to Supplements\n for discussion about physical processes; license changed to CC BY 4.0;\n revised to improve readability; some figures in Appendix omitted to improve\n conciseness"},{"id":"http://arxiv.org/abs/2501.04964v1","updated":"2025-01-09T04:34:07Z","published":"2025-01-09T04:34:07Z","title":"Promoting Shared Energy Storage Aggregation among High Price-Tolerance\n Prosumer: An Incentive Deposit and Withdrawal Service","summary":" Many residential prosumers exhibit a high price-tolerance for household\nelectricity bills and a low response to price incentives. This is because the\nhousehold electricity bills are not inherently high, and the potential for\nsaving on electricity bills through participation in conventional Shared Energy\nStorage (SES) is limited, which diminishes their motivation to actively engage\nin SES. Additionally, existing SES models often require prosumers to take\nadditional actions, such as optimizing rental capacity and bidding prices,\nwhich happen to be capabilities that typical household prosumers do not\npossess. To incentivize these high price-tolerance residential prosumers to\nparticipate in SES, a novel SES aggregation framework is proposed, which does\nnot require prosumers to take additional actions and allows them to maintain\nexisting energy storage patterns. 
Compared to conventional long-term operation\nof SES, the proposed framework introduces an additional short-term construction\nstep during which the energy service provider (ESP) acquires control of the\nenergy storage systems (ESS) and offers electricity deposit and withdrawal\nservices (DWS) with dynamic coefficients, enabling prosumers to withdraw more\nelectricity than they deposit without additional actions. Additionally, a\nmatching mechanism is proposed to align prosumers' electricity consumption\nbehaviors with ESP's optimization strategies. Finally, the dynamic coefficients\nin DWS and trading strategies are optimized by an improved deep reinforcement\nlearning (DRL) algorithm. Case studies are conducted to verify the\neffectiveness of the proposed SES aggregation framework with DWS and the\nmatching mechanism.\n","authors":["Xin Lu","Jing Qiu","Cuo Zhang","Gang Lei","Jianguo Zhu"],"pdf_url":"https://arxiv.org/pdf/2501.04964v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.02792v3","updated":"2025-01-09T03:32:35Z","published":"2025-01-06T06:25:46Z","title":"Gaming on Coincident Peak Shaving: Equilibrium and Strategic Behavior","summary":" Coincident peak demand charges are imposed by power system operators or\nelectric utilities when the overall system demand, aggregated across multiple\nconsumers, reaches its peak. These charges incentivize consumers to reduce\ntheir demand during peak periods, a practice known as coincident peak shaving.\nIn this paper, we analyze the coincident peak shaving problem through the lens\nof game theory, developing a theoretical model to examine the impact of\nstrategic consumer behavior on system efficiency. We demonstrate that the game\nstructure exhibits varying characteristics - concave,\nquasiconcave/discontinuous, or non-concave/discontinuous - depending on the\nextent of consumers' demand-shifting capabilities. 
For a two-agent, two-period\nsetting, we derive closed-form Nash equilibrium solutions under each condition\nand generalize our findings to cases with multiple agents. We prove the\nstability of the equilibrium points and present an algorithm for computing\nequilibrium outcomes across all game scenarios. We also show that the\npeak-shaving effectiveness of the game model matches that of the centralized\npeak-shaving model but with increased levels of anarchy. In the cases of\nquasiconcave and non-concave game conditions, we analytically demonstrate in\nthe two-agent setting that anarchy increases with consumers' flexibility and\ninequity, as measured by their marginal shifting costs, and we also analyze the\ninfluence of the number of agents on anarchy. Finally, we provide numerical\nsimulations to validate our theoretical results.\n","authors":["Liudong Chen","Bolun Xu"],"pdf_url":"https://arxiv.org/pdf/2501.02792v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04937v1","updated":"2025-01-09T03:01:57Z","published":"2025-01-09T03:01:57Z","title":"Generalized Linear Models with 1-Bit Measurements: Asymptotics of the\n Maximum Likelihood Estimator","summary":" This work establishes regularity conditions for consistency and asymptotic\nnormality of the multiple parameter maximum likelihood estimator (MLE) from\ncensored data, where the censoring mechanism is in the form of $1$-bit\nmeasurements. The underlying distribution of the uncensored data is assumed to\nbelong to the exponential family, with natural parameters expressed as a linear\ncombination of the predictors, known as generalized linear model (GLM). As part\nof the analysis, the Fisher information matrix is also derived for both\ncensored and uncensored data, which helps to quantify the impact of censoring\nand assess the performance of the MLE. The choice of GLM allows one to consider\na variety of practical examples where 1-bit estimation is of interest. 
In\nparticular, it is shown how the derived results can be used to analyze two\npractically relevant scenarios: the Gaussian model with both unknown mean and\nvariance, and the Poisson model with an unknown mean.\n","authors":["Jaimin Shah","Martina Cardone","Cynthia Rush","Alex Dytso"],"pdf_url":"https://arxiv.org/pdf/2501.04937v1.pdf","comment":"ICASSP 2025"},{"id":"http://arxiv.org/abs/2501.04160v2","updated":"2025-01-09T02:53:56Z","published":"2025-01-07T22:19:06Z","title":"Collaborative Spacecraft Servicing under Partial Feedback using\n Lyapunov-based Deep Neural Networks","summary":" Multi-agent systems are increasingly applied in space missions, including\ndistributed space systems, resilient constellations, and autonomous rendezvous\nand docking operations. A critical emerging application is collaborative\nspacecraft servicing, which encompasses on-orbit maintenance, space debris\nremoval, and swarm-based satellite repositioning. These missions involve\nservicing spacecraft interacting with malfunctioning or defunct spacecraft\nunder challenging conditions, such as limited state information, measurement\ninaccuracies, and erratic target behaviors. Existing approaches often rely on\nassumptions of full state knowledge or single-integrator dynamics, which are\nimpractical for real-world applications involving second-order spacecraft\ndynamics. This work addresses these challenges by developing a distributed\nstate estimation and tracking framework that requires only relative position\nmeasurements and operates under partial state information. A novel\n$\\rho$-filter is introduced to reconstruct unknown states using locally\navailable information, and a Lyapunov-based deep neural network adaptive\ncontroller is developed that adaptively compensates for uncertainties stemming\nfrom unknown spacecraft dynamics. To ensure the collaborative spacecraft\nregulation problem is well-posed, a trackability condition is defined. 
A\nLyapunov-based stability analysis is provided to ensure exponential convergence\nof errors in state estimation and spacecraft regulation to a neighborhood of\nthe origin under the trackability condition. The developed method eliminates\nthe need for expensive velocity sensors or extensive pre-training, offering a\npractical and robust solution for spacecraft servicing in complex, dynamic\nenvironments.\n","authors":["Cristian F. Nino","Omkar Sudhir Patil","Christopher D. Petersen","Sean Phillips","Warren E. Dixon"],"pdf_url":"https://arxiv.org/pdf/2501.04160v2.pdf","comment":"24 pages, 4 Figures, Journal"},{"id":"http://arxiv.org/abs/2409.20511v2","updated":"2025-01-09T00:27:06Z","published":"2024-09-30T17:13:11Z","title":"Quantifying Metrics for Wildfire Ignition Risk from Geographic Data in\n Power Shutoff Decision-Making","summary":" Faults on power lines and other electric equipment are known to cause\nwildfire ignitions. To mitigate the threat of wildfire ignitions from electric\npower infrastructure, many utilities preemptively de-energize power lines,\nwhich may result in power shutoffs. Data regarding wildfire ignition risks are\nkey inputs for effective planning of power line de-energizations. However,\nthere are multiple ways to formulate risk metrics that spatially aggregate\nwildfire risk map data, and there are different ways of leveraging this data to\nmake decisions. The key contribution of this paper is to define and compare the\nresults of employing six metrics for quantifying the wildfire ignition risks of\npower lines from risk maps, considering both threshold- and optimization-based\nmethods for planning power line de-energizations. The numeric results use the\nCalifornia Test System (CATS), a large-scale synthetic grid model with power\nline corridors accurately representing California infrastructure, in\ncombination with real Wildland Fire Potential Index data for a full year. 
This\nis the first application of optimal power shutoff planning on such a large and\nrealistic test case. Our results show that the choice of risk metric\nsignificantly impacts the lines that are de-energized and the resulting load\nshed. We find that the optimization-based method results in significantly less\nload shed than the threshold-based method while achieving the same risk\nreduction.\n","authors":["Ryan Piansky","Sofia Taylor","Noah Rhodes","Daniel K. Molzahn","Line A. Roald","Jean-Paul Watson"],"pdf_url":"https://arxiv.org/pdf/2409.20511v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.17925v2","updated":"2025-01-09T23:12:11Z","published":"2024-11-26T22:41:40Z","title":"Stability and Synchronization of Kuramoto Oscillators","summary":" Imagine a group of oscillators, each endowed with their own rhythm or\nfrequency, be it the ticking of a biological clock, the swing of a pendulum, or\nthe glowing of fireflies. While these individual oscillators may seem\nindependent of one another at first glance, the true magic lies in their\nability to influence and synchronize with one another, like a group of\nfireflies glowing in unison.\n The Kuramoto model was motivated by this phenomenon of collective\nsynchronization, when a group of a large number of oscillators spontaneously\nlock to a common frequency, despite vast differences in their individual\nfrequencies. Inspired by Kuramoto's groundbreaking work in the 1970s, this\nmodel captures the essence of how interconnected systems, ranging from\nbiological networks to power grids, can achieve a state of synchronization.\n This work aims to study the stability and synchronization of Kuramoto\noscillators, starting off with an introduction to Kuramoto Oscillators and it's\nbroader applications. We then at a graph theoretic formulation for the same and\nestablish various criterion for the stability, synchronization of Kuramoto\nOscillators. 
Finally, we broadly analyze and experiment with various physical\nsystems that tend to behave like Kuramoto oscillators followed by further\nsimulations.\n","authors":["Abhiram Gorle"],"pdf_url":"https://arxiv.org/pdf/2411.17925v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.02094v2","updated":"2025-01-09T19:50:57Z","published":"2025-01-03T20:43:57Z","title":"SMTL: A Stratified Logic for Expressive Multi-Level Temporal\n Specifications","summary":" We present Stratified Metric Temporal Logic (SMTL), a novel formalism for\nspecifying and verifying properties of complex cyber-physical systems that\nexhibit behaviors across multiple temporal and abstraction scales. SMTL extends\nexisting temporal logics by incorporating a stratification operator, enabling\nthe association of temporal properties with specific abstraction levels. This\nallows for the natural expression of multi-scale requirements while maintaining\nformal reasoning about inter-level relationships. We formalize the syntax and\nsemantics of SMTL, proving that it strictly subsumes metric temporal logic\n(MTL) and offers enhanced expressiveness by capturing properties unattainable\nin existing logics. Numerical simulations comparing agents operating under MTL\nand SMTL specifications show that SMTL enhances agent coordination and safety,\nreducing collision rates without substantial computational overhead or\ncompromising path efficiency. 
These findings underscore SMTL's potential as a\nvaluable tool for designing and verifying complex multi-agent systems operating\nacross diverse temporal and abstraction scales.\n","authors":["Ali Baheri","Peng Wei"],"pdf_url":"https://arxiv.org/pdf/2501.02094v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05548v1","updated":"2025-01-09T19:38:27Z","published":"2025-01-09T19:38:27Z","title":"Switched Optimal Control with Dwell Time Constraints","summary":" This paper presents an embedding-based approach for solving switched optimal\ncontrol problems (SOCPs) with dwell time constraints. At first, an embedded\noptimal control problem (EOCP) is defined by replacing the discrete switching\nsignal with a continuous embedded variable that can take intermediate values\nbetween the discrete modes. While embedding enables solutions of SOCPs via\nconventional techniques, optimal solutions of EOCPs often involve nonexistent\nmodes and thus may not be feasible for the SOCP. In the modified EOCP (MEOCP),\na concave function is added to the cost function to enforce a bang-bang\nsolution in the embedded variable, which results in feasible solutions for the\nSOCP. However, the MEOCP cannot guarantee the satisfaction of dwell-time\nconstraints.\n In this paper, a MEOCP is combined with a filter layer to remove switching\ntimes that violate the dwell time constraint. Insertion gradients are used to\nminimize the effect of the filter on the optimal cost.\n","authors":["Masoud S. 
Sakha","Rushikesh Kamalapurkar"],"pdf_url":"https://arxiv.org/pdf/2501.05548v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.15745v7","updated":"2025-01-09T19:35:10Z","published":"2024-01-28T20:12:08Z","title":"The computation of approximate feedback Stackelberg equilibria in\n multi-player nonlinear constrained dynamic games","summary":" Solving feedback Stackelberg games with nonlinear dynamics and coupled\nconstraints, a common scenario in practice, presents significant challenges.\nThis work introduces an efficient method for computing approximate local\nfeedback Stackelberg equilibria in multi-player general-sum dynamic games, with\ncontinuous state and action spaces. Different from existing (approximate)\ndynamic programming solutions that are primarily designed for unconstrained\nproblems, our approach involves reformulating a feedback Stackelberg dynamic\ngame into a sequence of nested optimization problems, enabling the derivation\nof Karush-Kuhn-Tucker (KKT) conditions and the establishment of a second-order\nsufficient condition for local feedback Stackelberg equilibria. We propose a\nNewton-style primal-dual interior point method for solving constrained linear\nquadratic (LQ) feedback Stackelberg games, offering provable convergence\nguarantees. Our method is further extended to compute local feedback\nStackelberg equilibria for more general nonlinear games by iteratively\napproximating them using LQ games, ensuring that their KKT conditions are\nlocally aligned with those of the original nonlinear games. We prove the\nexponential convergence of our algorithm in constrained nonlinear games. 
In a\nfeedback Stackelberg game with nonlinear dynamics and (nonconvex) coupled costs\nand constraints, our experimental results reveal the algorithm's ability to\nhandle infeasible initial conditions and achieve exponential convergence\ntowards an approximate local feedback Stackelberg equilibrium.\n","authors":["Jingqi Li","Somayeh Sojoudi","Claire Tomlin","David Fridovich-Keil"],"pdf_url":"https://arxiv.org/pdf/2401.15745v7.pdf","comment":"This manuscript has been accepted by SIAM Journal on Optimization. We\n fix few typos in this arxiv version"},{"id":"http://arxiv.org/abs/2501.04988v1","updated":"2025-01-09T06:01:34Z","published":"2025-01-09T06:01:34Z","title":"Intelligent Sailing Model for Open Sea Navigation","summary":" Autonomous vessels potentially enhance safety and reliability of seaborne\ntrade. To facilitate the development of autonomous vessels, high-fidelity\nsimulations are required to model realistic interactions with other vessels.\nHowever, modeling realistic interactive maritime traffic is challenging due to\nthe unstructured environment, coarsely specified traffic rules, and largely\nvarying vessel types. Currently, there is no standard for simulating\ninteractive maritime environments in order to rigorously benchmark autonomous\nvessel algorithms. In this paper, we introduce the first intelligent sailing\nmodel (ISM), which simulates rule-compliant vessels for navigation on the open\nsea. An ISM vessel reacts to other traffic participants according to maritime\ntraffic rules while at the same time solving a motion planning task\ncharacterized by waypoints. In particular, the ISM monitors the applicable\nrules, generates rule-compliant waypoints accordingly, and utilizes a model\npredictive control for tracking the waypoints. We evaluate the ISM in two\nenvironments: interactive traffic with only ISM vessels and mixed traffic where\nsome vessel trajectories are from recorded real-world maritime traffic data or\nhandcrafted for criticality. 
Our results show that simulations with many ISM\nvessels of different vessel types are rule-compliant and scalable. We tested\n4,049 critical traffic scenarios. For interactive traffic with ISM vessels, no\ncollisions occurred while goal-reaching rates of about 97 percent were\nachieved. We believe that our ISM can serve as a standard for challenging and\nrealistic maritime traffic simulation to accelerate autonomous vessel\ndevelopment.\n","authors":["Hanna Krasowski","Stefan Schärdinger","Murat Arcak","Matthias Althoff"],"pdf_url":"https://arxiv.org/pdf/2501.04988v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07595v1","updated":"2025-01-09T23:02:16Z","published":"2025-01-09T23:02:16Z","title":"LUCAS: A Low-Power Ultra-Low Jitter Compact ASIC for SiPM Targetting\n ToF-CT","summary":" We present LUCAS (Low power Ultra-low jitter Compact ASIC for SiPM), an\nanalog front-end for Silicon Photomultipliers (SiPM) targeting fast timing\ndetectors in Time-of-Flight Computed Tomography (ToF-CT). LUCAS features a very\nlow input impedance preamplifier followed by a voltage comparator. It is\ndesigned in TSMC 65 nm low-power CMOS technology with a power supply of 1.2 V.\nOur first 8-channel prototype has been sent to fabrication and will be received\nin August 2023. Post-layout simulations predict less than 40 ps FWHM SPTR\njitter and an approximate power consumption of 3.2 mW per channel. The front\nend is suitable for applications with rigorous jitter requirements and high\nevent rates, thanks to its 3.9 GHz unity-gain bandwidth. 
The front-end compact\nform factor will facilitate its incorporation into systems demanding high\nchannel densities.\n","authors":["Seyed Arash Katourani"],"pdf_url":"https://arxiv.org/pdf/2501.07595v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09173v1","updated":"2025-01-09T11:06:35Z","published":"2025-01-09T11:06:35Z","title":"Formalising the intentional stance 2: a coinductive approach","summary":" Given a stochastic process with inputs and outputs, how might its behaviour\nbe related to pursuit of a goal? We model this using 'transducers', objects\nthat capture only the external behaviour of a system and not its internal\nstate. A companion paper summarises our results for cognitive scientists; the\ncurrent paper gives formal definitions and proofs.\n To formalise the concept of a system that behaves as if it were pursuing a\ngoal, we consider what happens when a transducer (a 'policy') is coupled to\nanother transducer that comes equipped with a success condition (a\n'teleo-environment'). An optimal policy is identified with a transducer that\nbehaves as if it were perfectly rational in the pursuit of a goal; our\nframework also allows us to model constrained rationality.\n Optimal policies obey a version of Bellman's principle: a policy that's\noptimal in one time step will again be optimal in the next time step, but with\nrespect to a different teleo-environment (obtained from the original one by a\nmodified version of Bayesian filtering). This property sometimes also applies\nto the bounded-rational case; we give a sufficient condition.\n A policy is deterministic if and only if there exists a teleo-environment for\nwhich it is uniquely optimal among the set of all policies; we relate this to\nclassical representation theorems from decision theory. This result need not\nhold in the bounded-rational case; we give an example related to the\nabsent-minded driver problem. 
The formalism is defined using coinduction,\nfollowing the style proposed by Czajka.\n","authors":["Simon McGregor"," timorl","Nathaniel Virgo"],"pdf_url":"https://arxiv.org/pdf/2501.09173v1.pdf","comment":"This is the companion paper to \"Formalising the intentional stance 1:\n attributing goals and beliefs to stochastic processes\" (uploaded as version 2\n of arXiv:2405.16490). The other paper is an overview aimed at cognitive\n scientists while this paper gives full mathematical details. 50 pages, no\n figures"}],"Optimization and Control":[{"id":"http://arxiv.org/abs/2501.05430v1","updated":"2025-01-09T18:42:49Z","published":"2025-01-09T18:42:49Z","title":"A dimension reduction procedure for the design of lattice-spring systems\n with minimal fabrication cost and required multi-functional properties","summary":" We show that the problem of the design of the lattices of elastoplastic\ncurrent conducting springs with optimal multi-functional properties leads to an\nanalytically tractable problem. Specifically, focusing on a lattice with a\nsmall number of springs, we use the technique of inequalities to reduce the\nnumber of variables and to compute the minimal cost of lattice fabrication\nexplicitly.\n","authors":["Egor Makarenkov","Sakshi Malhotra","Yang Jiao"],"pdf_url":"https://arxiv.org/pdf/2501.05430v1.pdf","comment":"20 pages, 10 figures"},{"id":"http://arxiv.org/abs/2408.01857v2","updated":"2025-01-09T17:54:15Z","published":"2024-08-03T20:00:36Z","title":"Using Linearized Optimal Transport to Predict the Evolution of\n Stochastic Particle Systems","summary":" We develop an algorithm to approximate the time evolution of a probability\ndistribution without explicitly learning an operator that governs the\nevolution. A particular application of interest is discrete measures $\\mu_t^N$\nthat arise from systems of $N$ particles in $\\mathbb R^d$. 
In many such\nsituations, the individual particles move chaotically on short time scales,\nmaking it difficult to learn the dynamics of a governing operator, but the bulk\ndistribution $\\mu_t^N$ approximates an absolutely continuous measure $\\mu_t$\nthat evolves ``smoothly.'' If $\\mu_t$ is known on some time interval, then\nlinearized optimal transport theory provides an Euler-like scheme for\napproximating the evolution of $\\mu_t$ using its ``tangent vector field''\n(represented as a time-dependent vector field on $\\mathbb R^d$), which can be\ncomputed as a limit of optimal transport maps. We propose an analog of this\nEuler approximation to predict the evolution of the discrete measure $\\mu_t^N$\n(without knowing $\\mu_t$). To approximate the analogous tangent vector field,\nwe use a finite difference over a time step that sits between two time scales\nof the system -- long enough for a large-$N$ evolution ($\\mu_t$) to emerge but\nshort enough to satisfactorily approximate the derivative object used in the\nEuler scheme. The emergence of the limiting behavior ensures the optimal\ntransport maps closely approximate the vector field describing the bulk\ndistribution's smooth evolution instead of the individual particles' more\nchaotic movements. We demonstrate the efficacy of our approach with two\nillustrative examples, Gaussian diffusion and a cell chemotaxis model, and show\nthat our method succeeds in predicting the bulk behavior over relatively large\nsteps.\n","authors":["Nicholas Karris","Evangelos A. 
Nikitopoulos","Ioannis Kevrekidis","Seungjoon Lee","Alexander Cloninger"],"pdf_url":"https://arxiv.org/pdf/2408.01857v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05388v1","updated":"2025-01-09T17:22:11Z","published":"2025-01-09T17:22:11Z","title":"A fast approximate scenario addition method for two-stage robust\n mixed-integer programs","summary":" This paper presents a new scenario addition method for two-stage robust\nmixed-integer programs with finite uncertainty sets. Our method combines and\nextends speed-up techniques used in previous scenario addition methods (also\ncalled column-and-constraint generation methods) and introduces several new\ntechniques. In particular, it uses dual bounds for second-stage problems in\norder to allow a faster identification of the next promising scenario to be\nadded to the master problem. Moreover, adaptive time limits are imposed to\navoid getting stuck on particularly hard second-stage problems, and a gap\npropagation between master problem and second-stage problems is used to stop\nsolving them earlier if only a given non-zero optimality gap is to be reached\noverall. This makes our method particularly effective for problems where\nsolving the second-stage problem is computationally challenging. To evaluate\nthe method's performance, we compare it to two recent scenario addition methods\nfrom the literature on two applications: a robust capacitated location routing\nproblem and a robust integrated berth allocation and quay crane assignment and\nscheduling problem. The first problem features a particularly hard second\nstage, and we show that our method is able to solve considerably more and\nlarger instances in a given time limit. 
Using the second problem, we verify the\ngeneral applicability of our method, even for problems where the second stage\nis relatively easy.\n","authors":["Marc Goerigk","Dorothee Henke","Johannes Kager","Fabian Schäfer","Clemens Thielen"],"pdf_url":"https://arxiv.org/pdf/2501.05388v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05373v1","updated":"2025-01-09T16:57:54Z","published":"2025-01-09T16:57:54Z","title":"On the emergence of almost-honeycomb structures in low-energy planar\n clusters","summary":" Several commonly observed physical and biological systems are arranged in\nshapes that closely resemble a honeycomb cluster, that is, a tessellation of\nthe plane by regular hexagons. Although these shapes are not always the direct\nproduct of energy minimization, they can still be understood, at least\nphenomenologically, as low-energy configurations. In this paper, explicit\nquantitative estimates on the geometry of such low-energy configurations are\nprovided, showing in particular that the vast majority of the chambers must be\ngeneralized polygons with six edges, and must closely resemble regular\nhexagons. Part of our arguments is a detailed revision of the estimates behind\nthe global isoperimetric principle for honeycomb clusters due to Hales (T. C.\nHales. The honeycomb conjecture. Discrete Comput. Geom., 25(1):1-22, 2001).\n","authors":["Marco Caroccia","Kenneth DeMason","Francesco Maggi"],"pdf_url":"https://arxiv.org/pdf/2501.05373v1.pdf","comment":"32 pages, 3 figures"},{"id":"http://arxiv.org/abs/2501.05365v1","updated":"2025-01-09T16:48:14Z","published":"2025-01-09T16:48:14Z","title":"Control of Overpopulated Tails in Kinetic Epidemic Models","summary":" We introduce model-based transition rates for controlled compartmental models\nin mathematical epidemiology, with a focus on the effects of control strategies\napplied to interacting multi-agent systems describing contact formation\ndynamics. 
In the framework of kinetic control problems, we compare two\nprototypical control protocols: one additive control directly influencing the\ndynamics and another targeting the interaction strength between agents. The\nemerging controlled macroscopic models are derived for an SIR\ncompartmentalization to illustrate their impact on epidemic progression and\ncontact interaction dynamics. Numerical results show the effectiveness of this\napproach in steering the dynamics and controlling epidemic trends, even in\nscenarios where contact distributions exhibit an overpopulated tail.\n","authors":["Mattia Zanella","Andrea Medaglia"],"pdf_url":"https://arxiv.org/pdf/2501.05365v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03144v2","updated":"2025-01-09T16:35:13Z","published":"2025-01-06T17:09:38Z","title":"Enhancing Quantum State Reconstruction with Structured Classical Shadows","summary":" Quantum state tomography (QST) remains the prevailing method for benchmarking\nand verifying quantum devices; however, its application to large quantum\nsystems is rendered impractical due to the exponential growth in both the\nrequired number of total state copies and classical computational resources.\nRecently, the classical shadow (CS) method has been introduced as a more\ncomputationally efficient alternative, capable of accurately predicting key\nquantum state properties. Despite its advantages, a critical question remains\nas to whether the CS method can be extended to perform QST with guaranteed\nperformance. In this paper, we address this challenge by introducing a\nprojected classical shadow (PCS) method with guaranteed performance for QST\nbased on Haar-random projective measurements. PCS extends the standard CS\nmethod by incorporating a projection step onto the target subspace. 
For a\ngeneral quantum state consisting of $n$ qubits, our method requires a minimum\nof $O(4^n)$ total state copies to achieve a bounded recovery error in the\nFrobenius norm between the reconstructed and true density matrices, reducing to\n$O(2^n r)$ for states of rank $r<2^n$ -- meeting information-theoretic optimal\nbounds in both cases. For matrix product operator states, we demonstrate that\nthe PCS method can recover the ground-truth state with $O(n^2)$ total state\ncopies, improving upon the previously established Haar-random bound of\n$O(n^3)$. Simulation results further validate the effectiveness of the proposed\nPCS method.\n","authors":["Zhen Qin","Joseph M. Lukens","Brian T. Kirby","Zhihui Zhu"],"pdf_url":"https://arxiv.org/pdf/2501.03144v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08188v4","updated":"2025-01-09T15:54:36Z","published":"2024-01-16T08:04:34Z","title":"Bounded weak solutions for Keller-Segel equations with generalized\n diffusion and logistic source via an unbalanced Optimal Transport splitting\n scheme","summary":" We consider a parabolic-elliptic type of Keller-Segel equations with\ngeneralized diffusion and logistic source under homogeneous Neumann-Neumann\nboundary conditions. We construct bounded weak solutions globally in time in an\nunbalanced optimal transport framework, provided that the magnitude of the\nchemotactic sensitivity can be restricted depending on parameters. 
In the case\nof subquadratic degradation of the logistic source, we quantify the chemotactic\nsensitivity, in particular, in terms of the power of degradation and the\npointwise bound of the initial density.\n","authors":["Kyungkeun Kang","Hwa Kil Kim","Geuntaek Seo"],"pdf_url":"https://arxiv.org/pdf/2401.08188v4.pdf","comment":"29 pages"},{"id":"http://arxiv.org/abs/2501.05320v1","updated":"2025-01-09T15:40:59Z","published":"2025-01-09T15:40:59Z","title":"Isoperimetric inequalities for the fractional composite membrane problem","summary":" In this article, we investigate some isoperimetric-type inequalities related\nto the first eigenvalue of the fractional composite membrane problem. First, we\nestablish an analogue of the renowned Faber-Krahn inequality for the fractional\ncomposite membrane problem. Next, we investigate an isoperimetric inequality\nfor the first eigenvalue of the fractional composite membrane problem on the\nintersection of two domains-a problem that was first studied by Lieb [23] for\nthe Laplacian. Similar results in the local case were previously obtained by\nCupini-Vecchi [9] for the composite membrane problem. Our findings provide\nfurther insights into the fractional setting, offering a new perspective on\nthese classical inequalities.\n","authors":["Mrityunjoy Ghosh"],"pdf_url":"https://arxiv.org/pdf/2501.05320v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2412.13538v2","updated":"2025-01-09T15:27:12Z","published":"2024-12-18T06:35:10Z","title":"Stabilization of strictly pre-dissipative nonlinear receding horizon\n control by terminal costs","summary":" It is known that receding horizon control with a strictly pre-dissipative\noptimal control problem yields a practically asymptotically stable closed loop\nwhen suitable state constraints are imposed. 
In this note we show that\nalternatively suitably bounded terminal costs can be used for stabilizing the\nclosed loop.\n","authors":["Lars Grüne","Mario Zanon"],"pdf_url":"https://arxiv.org/pdf/2412.13538v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05280v1","updated":"2025-01-09T14:43:29Z","published":"2025-01-09T14:43:29Z","title":"Exploring near-optimal energy systems with stakeholders: a novel\n approach for participatory modelling","summary":" Involving people in energy systems planning can increase the legitimacy and\nsocio-political feasibility of energy transitions. Participatory research in\nenergy modelling offers the opportunity to engage with stakeholders in a\ncomprehensive way, but is limited by how results can be generated and presented\nwithout imposing assumptions and discrete scenarios on the participants. To\nthis end, we present a methodology and a framework, based on near-optimal\nmodelling results, that can incorporate stakeholders in a holistic and engaging\nway. We confront stakeholders with a continuum of modelling-based energy system\ndesigns via an interactive interface allowing them to choose essentially any\ncombination of components that meet the system requirements. Together with\ninformation on the implications of different technologies, it is possible to\nassess how participants prioritise different aspects in energy systems planning\nwhile also facilitating learning in an engaging and stimulating way. We\nshowcase the methodology for the remote Arctic settlement of Longyearbyen and\nillustrate how participants deviate consistently from the cost optimum. 
At the\nsame time, they manage to balance different priorities such as emissions,\ncosts, and system vulnerability leading to a better understanding of the\ncomplexity and intertwined nature of decisions.\n","authors":["Oskar Vågerö","Koen van Greevenbroek","Aleksander Grochowicz","Maximilian Roithner"],"pdf_url":"https://arxiv.org/pdf/2501.05280v1.pdf","comment":"24 pages, 7 figures and 3 tables"},{"id":"http://arxiv.org/abs/2311.09844v2","updated":"2025-01-09T14:39:59Z","published":"2023-11-16T12:15:43Z","title":"Observability of the linear Zakharov--Kuznetsov equation","summary":" We study the linear Zakharov--Kuznetsov equation with periodic boundary\nconditions. Employing some tools from the nonharmonic Fourier series we obtain\nseveral internal observability theorems. Then we prove various exact\ncontrollability and rapid uniform stabilization results by applying a duality\nprinciple and a general feedback construction. The method presented here\nintroduces a new insight into the control of dispersive equations in\ntwo-dimensional cases and may be adapted to more general equations.\n","authors":["Roberto de A. Capistrano Filho","Vilmos Komornik","Ademir F. Pazoto"],"pdf_url":"https://arxiv.org/pdf/2311.09844v2.pdf","comment":"30 pages, 2 figures. Comments are welcome"},{"id":"http://arxiv.org/abs/2501.04572v2","updated":"2025-01-09T14:30:41Z","published":"2025-01-08T15:42:41Z","title":"Regret Analysis: a control perspective","summary":" Online learning and model reference adaptive control have many interesting\nintersections. One area where they differ however is in how the algorithms are\nanalyzed and what objective or metric is used to discriminate \"good\" algorithms\nfrom \"bad\" algorithms. 
In adaptive control there are usually two objectives: 1)\nprove that all time-varying parameters/states of the system are bounded, and 2)\nthat the instantaneous error between the adaptively controlled system and a\nreference system converges to zero over time (or at least a compact set). For\nonline learning the performance of algorithms is often characterized by the\nregret the algorithm incurs. Regret is defined as the cumulative loss (cost)\nover time from the online algorithm minus the cumulative loss (cost) of the\nsingle optimal fixed parameter choice in hindsight. Another significant\ndifference between the two areas of research is with regard to the assumptions\nmade in order to obtain said results. Adaptive control makes assumptions about\nthe input-output properties of the control problem and derives solutions for a\nfixed error model or optimization task. In the online learning literature\nresults are derived for classes of loss functions (i.e. convex) while a priori\nassuming that all time-varying parameters are bounded, which for many\noptimization tasks is not unrealistic, but is a non-starter in control\napplications. In this work we discuss these differences in detail through the\nregret-based analysis of gradient descent for convex functions and the\ncontrol-based analysis of a streaming regression problem. We close with a discussion\nabout the newly defined paradigm of online adaptive control and ask the\nfollowing question \"Are regret optimal control strategies deployable?\"\n","authors":["Travis E. Gibson","Sawal Acharya"],"pdf_url":"https://arxiv.org/pdf/2501.04572v2.pdf","comment":"10 pages no figures"},{"id":"http://arxiv.org/abs/2501.05270v1","updated":"2025-01-09T14:27:15Z","published":"2025-01-09T14:27:15Z","title":"Identifiability of Controlled Open Quantum Systems","summary":" Open quantum systems are a rich area of research on the intersection of\nquantum mechanics and stochastic analysis. 
We unify multiple views of\ncontrolled open quantum systems within the framework of bilinear dynamical\nsystems. We define the corresponding notions of identifiability from the\nresults of quantum state tomography, obtained in many copies of the initial\nquantum state, under subsequences of varying lengths of control signals. We\nexplain and extend work on identifiability of bilinear systems using\nspectral criteria, criteria based on the Hankel matrix, and frequency-domain\ncriteria, to the parameter estimation of master equations of open quantum\nsystems. This sets the groundwork for a number of constructive approaches to\nthe identification of open quantum systems.\n","authors":["Waqas Parvaiz","Johannes Aspman","Ales Wodecki","Georgios Korpas","Jakub Marecek"],"pdf_url":"https://arxiv.org/pdf/2501.05270v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05200v1","updated":"2025-01-09T12:51:39Z","published":"2025-01-09T12:51:39Z","title":"On Coordinated Drone-Courier Logistics for Intra-city Express Services","summary":" Problem definition: Drones, despite being acknowledged as a transformative\nforce in the city logistics sector, are unable to execute the\n\\textit{last-meter delivery} (unloading goods directly to customers' doorsteps)\ndue to airspace restrictions and safety concerns. To leverage advancements and\novercome the limitations of drones in providing intra-city express services, we\nintroduce a coordinated drone-courier logistics system where drones operate\nwithin a closed network among vertiports, while couriers connect customers to\nthe drone delivery system. This paper aims to shed light on this coordinated\nsystem in terms of system feasibility, network interactivity, and long-term\nsustainability. Methodology/Results: We develop an integrated optimization\nmodel to optimize the network planning of the coordinated logistics system. 
The\ninterplay between network planning and tactical operations is mirrored by a\nqueueing network model, resulting in the nonlinear and nonconvex (partially\nconvex and partially concave) feasible region of the optimization model. An\niterative exact algorithm that tightens lower and upper bounds by adaptively\nrefining the linear approximations of nonlinear constraints is developed to\nprovide optimality-guaranteed solutions with finite convergence. The\ncomputational experiments demonstrate the scalability and robustness of our\nalgorithm across various network configurations and scenarios. Managerial\nimplications: The case study, based on a real-world dataset from SF Express, a\nlogistics giant in China, validates that the coordinated logistics system\nefficiently attains cost and time savings by leveraging the effective turnover\nof drones and the coordination between drones and couriers. The optimal network\ndesign features a concentrated structure, streamlining demand consolidation and\nreducing deadhead repositioning.\n","authors":["Shuiwang Chen","Kai Wang","Lingxiao Wu","Wei Qi"],"pdf_url":"https://arxiv.org/pdf/2501.05200v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05178v1","updated":"2025-01-09T11:53:52Z","published":"2025-01-09T11:53:52Z","title":"KLAP: KYP lemma based low-rank approximation for $\\mathcal{H}_2$-optimal\n passivation","summary":" We present a novel passivity enforcement (passivation) method, called KLAP,\nfor linear time-invariant systems based on the Kalman-Yakubovich-Popov (KYP)\nlemma and the closely related Lur'e equations. The passivation problem in our\nframework corresponds to finding a perturbation to a given non-passive system\nthat renders the system passive while minimizing the $\\mathcal{H}_2$ or\nfrequency-weighted $\\mathcal{H}_2$ distance between the original non-passive\nand the resulting passive system. 
We show that this problem can be formulated\nas an unconstrained optimization problem whose objective function can be\ndifferentiated efficiently even in large-scale settings. We show that any\nminimizer of the unconstrained problem yields the same passive system.\nFurthermore, we prove that, in the absence of a feedthrough term, every local\nminimizer is also a global minimizer. For cases involving a non-trivial\nfeedthrough term, we analyze global minimizers in relation to the extremal\nsolutions of the Lur'e equations, which can serve as tools for identifying\nlocal minima. To solve the resulting numerical optimization problem\nefficiently, we propose an initialization strategy based on modifying the\nfeedthrough term and a restart strategy when it is likely that the optimization\nhas converged to a local minimum. Numerical examples illustrate the\neffectiveness of the proposed method.\n","authors":["Jonas Nicodemus","Matthias Voigt","Serkan Gugercin","Benjamin Unger"],"pdf_url":"https://arxiv.org/pdf/2501.05178v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05158v1","updated":"2025-01-09T11:27:58Z","published":"2025-01-09T11:27:58Z","title":"An Efficient Mixed-Integer Formulation and an Iterative Method for\n Optimal Control of Switched Systems Under Dwell Time Constraints","summary":" This paper presents an efficient Mixed-Integer Nonlinear Programming (MINLP)\nformulation for systems with discrete control inputs under dwell time\nconstraints. By viewing such systems as a switched system, the problem is\ndecomposed into a Sequence Optimization (SO) and a Switching Time Optimization\n(STO) -- the former providing the sequence of the switched system, and the\nlatter calculating the optimal switching times. By limiting the feasible set of\nSO to subsequences of a master sequence, this formulation requires a small\nnumber of binary variables, independent of the number of time discretization\nnodes. 
This enables the proposed formulation to provide solutions efficiently,\neven for large numbers of time discretization nodes. To provide even faster\nsolutions, an iterative algorithm is introduced to heuristically solve STO and\nSO. The proposed approaches are then showcased on four different switched\nsystems and the results demonstrate the efficiency of the MINLP formulation and the\niterative algorithm.\n","authors":["Ramin Abbasi-Esfeden","Armin Nurkanovic","Moritz Diehl","Panagiotis Patrinos","Jan Swevers"],"pdf_url":"https://arxiv.org/pdf/2501.05158v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03105v2","updated":"2025-01-09T11:24:56Z","published":"2024-04-03T23:07:24Z","title":"Methodology for Interpretable Reinforcement Learning for Optimizing\n Mechanical Ventilation","summary":" Mechanical ventilation is a critical life support intervention that delivers\ncontrolled air and oxygen to a patient's lungs, assisting or replacing\nspontaneous breathing. While several data-driven approaches have been proposed\nto optimize ventilator control strategies, they often lack interpretability and\nalignment with domain knowledge, hindering clinical adoption. This paper\npresents a methodology for interpretable reinforcement learning (RL) aimed at\nimproving mechanical ventilation control as part of connected health systems.\nUsing a causal, nonparametric model-based off-policy evaluation, we assess RL\npolicies for their ability to enhance patient-specific outcomes, specifically\nincreasing blood oxygen levels (SpO2), while avoiding aggressive ventilator\nsettings that may cause ventilator-induced lung injuries and other\ncomplications. Through numerical experiments on real-world ICU data from the\nMIMIC-III database, we demonstrate that our interpretable decision tree policy\nachieves performance comparable to state-of-the-art deep RL methods while\noutperforming standard behavior cloning approaches. 
The results highlight the\npotential of interpretable, data-driven decision support systems to improve\nsafety and efficiency in personalized ventilation strategies, paving the way\nfor seamless integration into connected healthcare environments.\n","authors":["Joo Seung Lee","Malini Mahendra","Anil Aswani"],"pdf_url":"https://arxiv.org/pdf/2404.03105v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16490v2","updated":"2025-01-09T10:38:46Z","published":"2024-05-26T08:58:03Z","title":"Formalising the intentional stance 1: attributing goals and beliefs to\n stochastic processes","summary":" This article presents a formalism inspired by Dennett's notion of the\nintentional stance. Whereas Dennett's treatment of these concepts is informal,\nwe aim to provide a more formal analogue. We introduce a framework based on\nstochastic processes with inputs and outputs, in which we can talk precisely\nabout *interpreting* systems as having *normative-epistemic states*, which\ncombine belief-like and desire-like features. Our framework is based on\noptimality but nevertheless allows us to model some forms of bounded cognition.\n One might expect that the systems that can be described in\nnormative-epistemic terms would be some special subset of all systems, but we\nshow that this is not the case: every system admits a (possibly trivial)\nnormative-epistemic interpretation, and those that can be *uniquely specified*\nby a normative-epistemic description are exactly the deterministic ones.\nFinally, we show that there is a suitable notion of Bayesian updating for\nnormative-epistemic states, which we call *value-laden filtering*, since it\ninvolves both normative and epistemic elements. For unbounded cognition it is\nalways permissible to attribute beliefs that update in this way. 
This is not\nalways the case for bounded cognition, but we give a sufficient condition under\nwhich it is.\n This paper gives an overview of our framework aimed at cognitive scientists,\nwith a formal mathematical treatment given in a companion paper.\n","authors":["Simon McGregor"," timorl","Nathaniel Virgo"],"pdf_url":"https://arxiv.org/pdf/2405.16490v2.pdf","comment":"The previous version of this document included the content of the\n companion paper, \"Formalising the intentional stance 2: a coinductive\n approach\". The paper has now been split into two, this one (which is an\n overview aimed at cognitive scientists) and the companion (which contains\n full mathematical detail). 16 pages, one figure with two subfigures"},{"id":"http://arxiv.org/abs/2412.16222v2","updated":"2025-01-09T10:27:10Z","published":"2024-12-18T12:54:50Z","title":"A matheuristic approach for an integrated lot-sizing and scheduling\n problem with a period-based learning effect","summary":" This research investigates a multi-product capacitated lot-sizing and\nscheduling problem incorporating a novel learning effect, namely the\nperiod-based learning effect. This is inspired by a real case in a core\nanalysis laboratory under a job shop setting. Accordingly, a Mixed-Integer\nLinear Programming (MILP) model is extended based on the big-bucket\nformulation, optimizing the total tardiness and overtime costs. Given the\ncomplexity of the problem, a cutting plane method is employed to simplify the\nmodel. Afterward, three matheuristic methods based on the rolling horizon\napproach are devised, incorporating two lower bounds and a local search\nheuristic. Furthermore, a post-processing approach is implemented to\nincorporate lot-streaming possibility. 
Computational experiments demonstrate:\n1) the simplified model performs effectively in terms of both solution quality\nand computational time; 2) although the model encounters challenges with\nlarge-scale instances, the proposed matheuristic methods achieve satisfactory\noutcomes; 3) the complexity of the models and\nsolution methods is independent of the learning effect; however, the value of the\nlearning effect may impact the performance of the lower bounds; 4) in\nmanufacturing settings, where lot-streaming is possible, incorporating\npost-processing can drastically improve the objective function; and 5) the impact\nof the period-based learning effect on the results is significant, and the\nmodel's sensitivity to time-based parameters (e.g., learning rate) is greater than\nto cost-based ones (e.g., tardiness cost).\n","authors":["Mohammad Rohaninejad","Behdin Vahedi-Nouri","Reza Tavakkoli-Moghaddam","Zdeněk Hanzálek"],"pdf_url":"https://arxiv.org/pdf/2412.16222v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05052v1","updated":"2025-01-09T08:20:43Z","published":"2025-01-09T08:20:43Z","title":"Cover-Relax-Search: A Primal Heuristic for Binary Quadratic Programs","summary":" Binary Quadratic Programs (BQPs) are a class of NP-hard problems that arise\nin a wide range of applications, including finance, machine learning, and\nlogistics. These problems are challenging to solve due to the combinatorial\nsearch space and nonlinearity. In fact, this class of optimization problems is\nso challenging that, in many instances, standard algorithms struggle to\nidentify feasible solutions within a reasonable time. Primal heuristic\nalgorithms have been developed to quickly identify feasible solutions to BQPs.\nIn this paper, we propose Cover-Relax-Search, an efficient primal heuristic for\nBQPs. This approach is inspired by multiple local search algorithms, including\nUndercover. 
We evaluate the \\emph{Cover-Relax-Search} algorithm on multiple BQP\nbenchmarks and show that our proposed heuristic identifies high-quality\nsolutions at a faster speed and significantly reduces the primal integral\ncompared to state-of-the-art solvers and other local search baselines.\n","authors":["Weimin Huang","Natalie M. Isenberg","Jan Drgona","Draguna L Vrabie","Bistra Dilkina"],"pdf_url":"https://arxiv.org/pdf/2501.05052v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05046v1","updated":"2025-01-09T08:12:34Z","published":"2025-01-09T08:12:34Z","title":"Quantum-Assisted Space Logistics Mission Planning","summary":" Quantum computing provides a novel approach to addressing conventionally\nintractable issues in large-scale optimization. Space logistics missions\nrequire the efficient routing of payloads, spacecraft, and resources across\ncomplex networks, often resulting in an exponential growth of the solution\nspace that classical methods cannot efficiently solve. This paper leverages\nentropy quantum computing to model and solve the space logistics problem as a\ntime-dependent multicommodity network flow, enabling the exploration of large\nsolution spaces. The findings highlight quantum computing's potential to\naddress complex aerospace logistics, demonstrating its suitability for complex\ninterplanetary mission planning.\n","authors":["Amiratabak Bahengam","Mohammad-Ali Miri","R. Joseph Rupert","Wesley Dyk","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2501.05046v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.18173v3","updated":"2025-01-09T05:08:53Z","published":"2024-12-24T05:18:12Z","title":"Optimal error estimates of the stochastic parabolic optimal control\n problem with integral state constraint","summary":" In this paper, the optimal strong error estimates for stochastic parabolic\noptimal control problem with additive noise and integral state constraint are\nderived based on time-implicit and finite element discretization. 
The\ncontinuous and discrete first-order optimality conditions are deduced by\nconstructing the Lagrange functional, which contains forward-backward\nstochastic parabolic equations and a variational equation. The fully discrete\nversion of forward-backward stochastic parabolic equations is introduced as an\nauxiliary problem and the optimal strong convergence orders are estimated,\nwhich further allows the optimal a priori error estimates for control, state,\nadjoint state and multiplier to be derived. Then, a simple and yet efficient\ngradient projection algorithm is proposed to solve stochastic parabolic control\nproblem and its convergence rate is proved. Numerical experiments are carried\nout to illustrate the theoretical findings.\n","authors":["Qiming Wang","Wanfang Shen","Wenbin Liu"],"pdf_url":"https://arxiv.org/pdf/2412.18173v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04972v1","updated":"2025-01-09T05:03:42Z","published":"2025-01-09T05:03:42Z","title":"Algebraic characterization of equivalence between optimization\n algorithms","summary":" When are two algorithms the same? How can we be sure a recently proposed\nalgorithm is novel, and not a minor twist on an existing method? In this paper,\nwe present a framework for reasoning about equivalence between a broad class of\niterative algorithms, with a focus on algorithms designed for convex\noptimization. We propose several notions of what it means for two algorithms to\nbe equivalent, and provide computationally tractable means to detect\nequivalence. Our main definition, oracle equivalence, states that two\nalgorithms are equivalent if they result in the same sequence of calls to the\nfunction oracles (for suitable initialization). Borrowing from control theory,\nwe use state-space realizations to represent algorithms and characterize\nalgorithm equivalence via transfer functions. 
Our framework can also identify\nand characterize equivalence between algorithms that use different oracles that\nare related via a linear fractional transformation. Prominent examples include\nlinear transformations and function conjugation.\n","authors":["Laurent Lessard","Madeleine Udell"],"pdf_url":"https://arxiv.org/pdf/2501.04972v1.pdf","comment":"This paper generalizes and provides new analysis and examples\n compared to arxiv:2105.04684"},{"id":"http://arxiv.org/abs/2405.19546v4","updated":"2025-01-09T05:01:32Z","published":"2024-05-29T22:19:39Z","title":"Convex Optimization of Initial Perturbations toward Quantitative Weather\n Control","summary":" This study proposes introducing convex optimization to find initial\nperturbations of atmospheric states to realize specified changes in subsequent\nweather. In the proposed method, we formulate and solve an inverse problem to\nfind effective perturbations in atmospheric variables so that controlled\nvariables satisfy specified changes at a specified time. The proposed method\nfirst constructs a sensitivity matrix of controlled variables, such as\naccumulated precipitation, to the initial atmospheric variables, such as\ntemperature and humidity, through sensitivity analysis using a numerical\nweather prediction (NWP) model. Then a convex optimization problem is\nformulated to achieve various control specifications involving not only\nquadratic functions but also absolute values and maximum values of the\ncontrolled variables and initial atmospheric variables in the cost function and\nconstraints. The proposed method was validated through a benchmark warm bubble\nexperiment using the NWP model. 
The experiments showed that the identified\nperturbations successfully realized specified spatial distributions of\naccumulated precipitation.\n","authors":["Toshiyuki Ohtsuka","Atsushi Okazaki","Masaki Ogura","Shunji Kotsuki"],"pdf_url":"https://arxiv.org/pdf/2405.19546v4.pdf","comment":"shortened to improve conciseness; some figures added to Supplements\n for discussion about physical processes; license changed to CC BY 4.0;\n revised to improve readability; some figures in Appendix omitted to improve\n conciseness"},{"id":"http://arxiv.org/abs/2411.01899v2","updated":"2025-01-09T03:57:46Z","published":"2024-11-04T09:06:15Z","title":"New Lagrangian dual algorithms for solving the continuous nonlinear\n resource allocation problem","summary":" The continuous nonlinear resource allocation problem (CONRAP) has broad\napplications in economics, engineering, production and inventory management,\nand often serves as a subproblem in complex programming. Without relying on\nmonotonicity assumptions for the objective and constraint functions, we propose\ntwo Lagrangian dual algorithms for solving two types of CONRAP. Both algorithms\ndetermine an update strategy for the Lagrange multiplier, utilizing the values\nof the objective and constraint functions at the current and previous\niterations. This strategy accelerates the process of finding dual optimal\nsolutions. Subsequently, leveraging the problem's convexity, the primal optimal\nsolution is either directly identified or derived by solving a one-dimensional\nlinear equation. We also prove that both algorithms converge to optimal\nsolutions within a finite number of iterations. Numerical experiments on six\ntypes of practical test problems illustrate the superior computational\nefficiency of the proposed algorithms. For test problems with a general\ninequality constraint, the first algorithm achieves a CPU time reduction\nexceeding an order of magnitude compared to solvers such as Gurobi and CVX. 
For\ntest problems with a linear equality constraint, the second algorithm\nconsistently outperforms four existing algorithms, delivering an improvement of\nover two orders of magnitude in computational efficiency.\n","authors":["Kaixiang Hu","Caixia Kou","Jianhua Yuan"],"pdf_url":"https://arxiv.org/pdf/2411.01899v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04936v1","updated":"2025-01-09T03:00:09Z","published":"2025-01-09T03:00:09Z","title":"Continuous and Discrete Systems for Quasi Variational Inequalities with\n Application to Game Theory","summary":" A new class of projected dynamical systems of third order is investigated for\nquasi (parametric) variational inequalities in which the convex set in the\nclassical variational inequality also depends upon the solution explicitly or\nimplicitly. We study the stability of a continuous method of a gradient type.\nSome iterative implicit and explicit schemes are suggested as counterparts of\nthe continuous case by inertial proximal methods. The convergence analysis of\nthese proposed methods is established under sufficient mild conditions.\nMoreover, some applications dealing with the generalized Nash equilibrium\nproblems are presented.\n","authors":["Oday Hazaimah"],"pdf_url":"https://arxiv.org/pdf/2501.04936v1.pdf","comment":"17 pages. arXiv admin note: text overlap with arXiv:2406.19345"},{"id":"http://arxiv.org/abs/2501.04160v2","updated":"2025-01-09T02:53:56Z","published":"2025-01-07T22:19:06Z","title":"Collaborative Spacecraft Servicing under Partial Feedback using\n Lyapunov-based Deep Neural Networks","summary":" Multi-agent systems are increasingly applied in space missions, including\ndistributed space systems, resilient constellations, and autonomous rendezvous\nand docking operations. A critical emerging application is collaborative\nspacecraft servicing, which encompasses on-orbit maintenance, space debris\nremoval, and swarm-based satellite repositioning. 
These missions involve\nservicing spacecraft interacting with malfunctioning or defunct spacecraft\nunder challenging conditions, such as limited state information, measurement\ninaccuracies, and erratic target behaviors. Existing approaches often rely on\nassumptions of full state knowledge or single-integrator dynamics, which are\nimpractical for real-world applications involving second-order spacecraft\ndynamics. This work addresses these challenges by developing a distributed\nstate estimation and tracking framework that requires only relative position\nmeasurements and operates under partial state information. A novel\n$\\rho$-filter is introduced to reconstruct unknown states using locally\navailable information, and a Lyapunov-based deep neural network adaptive\ncontroller is developed that adaptively compensates for uncertainties stemming\nfrom unknown spacecraft dynamics. To ensure the collaborative spacecraft\nregulation problem is well-posed, a trackability condition is defined. A\nLyapunov-based stability analysis is provided to ensure exponential convergence\nof errors in state estimation and spacecraft regulation to a neighborhood of\nthe origin under the trackability condition. The developed method eliminates\nthe need for expensive velocity sensors or extensive pre-training, offering a\npractical and robust solution for spacecraft servicing in complex, dynamic\nenvironments.\n","authors":["Cristian F. Nino","Omkar Sudhir Patil","Christopher D. Petersen","Sean Phillips","Warren E. 
Dixon"],"pdf_url":"https://arxiv.org/pdf/2501.04160v2.pdf","comment":"24 pages, 4 Figures, Journal"},{"id":"http://arxiv.org/abs/2501.04889v1","updated":"2025-01-09T00:05:31Z","published":"2025-01-09T00:05:31Z","title":"Projected proximal gradient trust-region algorithm for nonsmooth\n optimization","summary":" We consider trust-region methods for solving optimization problems where the\nobjective is the sum of a smooth, nonconvex function and a nonsmooth, convex\nregularizer. We extend the global convergence theory of such methods to include\nworst-case complexity bounds in the case of unbounded model Hessian growth, and\nintroduce a new, simple nonsmooth trust-region subproblem solver based on\ncombining several iterations of proximal gradient descent with a single\nprojection into the trust region, which meets the sufficient descent\nrequirements for algorithm convergence and has promising numerical results.\n","authors":["Minh N. Dao","Hung M. Phan","Lindon Roberts"],"pdf_url":"https://arxiv.org/pdf/2501.04889v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05623v1","updated":"2025-01-09T23:44:42Z","published":"2025-01-09T23:44:42Z","title":"A Quadratically-Constrained Convex Approximation for the AC Optimal\n Power Flow","summary":" We introduce a quadratically-constrained approximation (QCAC) of the AC\noptimal power flow (AC-OPF) problem. Unlike existing approximations like the\nDC-OPF, our model does not rely on typical assumptions such as high\nreactance-to-resistance ratio, near-nominal voltage magnitudes, or small angle\ndifferences, and preserves the structural sparsity of the original AC power\nflow equations, making it suitable for decentralized power systems optimization\nproblems. To achieve this, we reformulate the AC-OPF problem as a quadratically\nconstrained quadratic program. 
The nonconvex terms are expressed as differences\nof convex functions, which are then convexified around a base point derived\nfrom a warm start of the nodal voltages. If this linearization results in a\nnon-empty constraint set, the convexified constraints form an inner convex\napproximation. Our experimental results, based on Power Grid Library instances\nof up to 30,000 buses, demonstrate the effectiveness of the QCAC approximation\nwith respect to other well-documented conic relaxations and a linear\napproximation. We further showcase its potential advantages over the\nwell-documented second-order conic relaxation of the power flow equations in\ntwo proof-of-concept case studies: optimal reactive power dispatch in\ntransmission networks and PV hosting capacity in distribution grids.\n","authors":["Gonzalo E. Constante-Flores","Can Li"],"pdf_url":"https://arxiv.org/pdf/2501.05623v1.pdf","comment":"10 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2501.05619v1","updated":"2025-01-09T23:40:57Z","published":"2025-01-09T23:40:57Z","title":"Comparative Analysis of Two-Stage Distributionally Robust Optimization\n over 1-Wasserstein and 2-Wasserstein Balls","summary":" This paper investigates advantages of using 2-Wasserstein ambiguity sets over\n1-Wasserstein sets in two-stage distributionally robust optimization with\nright-hand side uncertainty. We examine the worst-case distributions within 1-\nand 2-Wasserstein balls under both unrestricted and nonnegative orthant\nsupports, highlighting a pathological behavior arising in 1-Wasserstein balls.\nClosed-form solutions for a single-scenario newsvendor problem illustrate that\n2-Wasserstein balls enable more informed decisions. 
Additionally, a\npenalty-based dual interpretation suggests that 2-Wasserstein balls may\noutperform 1-Wasserstein balls across a broader range of Wasserstein radii,\neven with general support sets.\n","authors":["Geunyeong Byeon"],"pdf_url":"https://arxiv.org/pdf/2501.05619v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.06881v3","updated":"2025-01-09T21:36:01Z","published":"2023-04-14T01:16:53Z","title":"Designing a Framework for Solving Multiobjective Simulation Optimization\n Problems","summary":" Multiobjective simulation optimization (MOSO) problems are optimization\nproblems with multiple conflicting objectives, where evaluation of at least one\nof the objectives depends on a black-box numerical code or real-world\nexperiment, which we refer to as a simulation. While an extensive body of\nresearch is dedicated to developing new algorithms and methods for solving\nthese and related problems, it is challenging and time consuming to integrate\nthese techniques into real world production-ready solvers. This is partly due\nto the diversity and complexity of modern state-of-the-art MOSO algorithms and\nmethods and partly due to the complexity and specificity of many real-world\nproblems and their corresponding computing environments. The complexity of this\nproblem is only compounded when introducing potentially complex and/or\ndomain-specific surrogate modeling techniques, problem formulations, design\nspaces, and data acquisition functions. This paper carefully surveys the\ncurrent state-of-the-art in MOSO algorithms, techniques, and solvers; as well\nas problem types and computational environments where MOSO is commonly applied.\nWe then present several key challenges in the design of a Parallel\nMultiobjective Simulation Optimization framework (ParMOO) and how they have\nbeen addressed. 
Finally, we provide two case studies demonstrating how\ncustomized ParMOO solvers can be quickly built and deployed to solve real-world\nMOSO problems.\n","authors":["Tyler H. Chang","Stefan M. Wild"],"pdf_url":"https://arxiv.org/pdf/2304.06881v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04406v2","updated":"2025-01-09T21:26:04Z","published":"2024-02-06T21:05:37Z","title":"Regularized MIP Model for Integrating Energy Storage Systems and its\n Application for Solving a Trilevel Interdiction Problem","summary":" Incorporating energy storage systems (ESS) into power systems has been\nstudied in many recent works, where binary variables are often introduced to\nmodel the complementary nature of battery charging and discharging. A\nconventional approach for these ESS optimization problems is to relax binary\nvariables and convert the problem into a linear program. However, such linear\nprogramming relaxation models can yield unrealistic fractional solutions, such\nas simultaneous charging and discharging. In this paper, we develop a\nregularized Mixed-Integer Programming (MIP) model for the ESS optimal power\nflow (OPF) problem. We prove that under mild conditions, the proposed\nregularized model admits a zero integrality gap with its linear programming\nrelaxation; hence, it can be solved efficiently. By studying the properties of\nthe regularized MIP model, we show that its optimal solution is also\nnear-optimal to the original ESS OPF problem, thereby providing a valid and\ntight upper bound for the ESS OPF problem. The use of the regularized MIP model\nallows us to solve a trilevel min-max-min network contingency problem which is\notherwise intractable to solve.\n","authors":["Dahye Han","Nan Jiang","Santanu S. 
Dey","Weijun Xie"],"pdf_url":"https://arxiv.org/pdf/2402.04406v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05582v1","updated":"2025-01-09T21:19:10Z","published":"2025-01-09T21:19:10Z","title":"Equivariant Perturbation in Gomory and Johnson's Infinite Group Problem.\n IV. The General Unimodular Two-Dimensional Case","summary":" We study an abstract setting for cutting planes for integer programming\ncalled the infinite group problem. In this abstraction, cutting planes are\ncomputed via cut generating functions that act on the simplex tableau. In this\nfunction space, cut generating functions are classified as minimal, extreme,\nand facets as a proxy for understanding the strength or potential importance of\nthese functions. Prior work developed algorithms for testing minimality,\nextremality, and facetness for cut generating functions applied to 1-row\ntableau and to some 2-row tableau in a restricted setting. We complement and\ngeneralize this work by giving an algorithm for testing the extremality of a\nlarge class of minimal valid functions for the two-dimensional infinite group\nproblem. Along the way, we develop results of independent interest on\nfunctional equations and infinite systems of linear equations.\n","authors":["Robert Hildebrand","Matthias Köppe","Luze Xu"],"pdf_url":"https://arxiv.org/pdf/2501.05582v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05548v1","updated":"2025-01-09T19:38:27Z","published":"2025-01-09T19:38:27Z","title":"Switched Optimal Control with Dwell Time Constraints","summary":" This paper presents an embedding-based approach for solving switched optimal\ncontrol problems (SOCPs) with dwell time constraints. First, an embedded\noptimal control problem (EOCP) is defined by replacing the discrete switching\nsignal with a continuous embedded variable that can take intermediate values\nbetween the discrete modes. 
While embedding enables solutions of SOCPs via\nconventional techniques, optimal solutions of EOCPs often involve nonexistent\nmodes and thus may not be feasible for the SOCP. In the modified EOCP (MEOCP),\na concave function is added to the cost function to enforce a bang-bang\nsolution in the embedded variable, which results in feasible solutions for the\nSOCP. However, the MEOCP cannot guarantee the satisfaction of dwell-time\nconstraints.\n In this paper, a MEOCP is combined with a filter layer to remove switching\ntimes that violate the dwell time constraint. Insertion gradients are used to\nminimize the effect of the filter on the optimal cost.\n","authors":["Masoud S. Sakha","Rushikesh Kamalapurkar"],"pdf_url":"https://arxiv.org/pdf/2501.05548v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.15745v7","updated":"2025-01-09T19:35:10Z","published":"2024-01-28T20:12:08Z","title":"The computation of approximate feedback Stackelberg equilibria in\n multi-player nonlinear constrained dynamic games","summary":" Solving feedback Stackelberg games with nonlinear dynamics and coupled\nconstraints, a common scenario in practice, presents significant challenges.\nThis work introduces an efficient method for computing approximate local\nfeedback Stackelberg equilibria in multi-player general-sum dynamic games, with\ncontinuous state and action spaces. Different from existing (approximate)\ndynamic programming solutions that are primarily designed for unconstrained\nproblems, our approach involves reformulating a feedback Stackelberg dynamic\ngame into a sequence of nested optimization problems, enabling the derivation\nof Karush-Kuhn-Tucker (KKT) conditions and the establishment of a second-order\nsufficient condition for local feedback Stackelberg equilibria. We propose a\nNewton-style primal-dual interior point method for solving constrained linear\nquadratic (LQ) feedback Stackelberg games, offering provable convergence\nguarantees. 
Our method is further extended to compute local feedback\nStackelberg equilibria for more general nonlinear games by iteratively\napproximating them using LQ games, ensuring that their KKT conditions are\nlocally aligned with those of the original nonlinear games. We prove the\nexponential convergence of our algorithm in constrained nonlinear games. In a\nfeedback Stackelberg game with nonlinear dynamics and (nonconvex) coupled costs\nand constraints, our experimental results reveal the algorithm's ability to\nhandle infeasible initial conditions and achieve exponential convergence\ntowards an approximate local feedback Stackelberg equilibrium.\n","authors":["Jingqi Li","Somayeh Sojoudi","Claire Tomlin","David Fridovich-Keil"],"pdf_url":"https://arxiv.org/pdf/2401.15745v7.pdf","comment":"This manuscript has been accepted by SIAM Journal on Optimization. We\n fix few typos in this arxiv version"},{"id":"http://arxiv.org/abs/1912.07356v5","updated":"2025-01-09T12:28:40Z","published":"2019-12-11T16:38:44Z","title":"The Integrated Vehicle and Pollster Routing Problem","summary":" The National Statistics Bureau of Ecuador carries out monthly polls to\nmonitor the evolution of the Consumer Price Index, a metric measuring consumer\nprices of essential commodities. These surveys are administered across a\ndesignated set of stores, with a fleet of vehicles transporting pollsters from\nthe bureau headquarters to the chosen locations. Moreover, pollsters move\nbetween stores using pedestrian paths or using a vehicle to shorten the travel\ntime. 
This paper introduces the Integrated Vehicle and Pollster Routing Problem\nand presents an integer programming model to effectively schedule pollster\nvisits to selected stores while optimizing the routing of the vehicle fleet.\nResults on the computational complexity, a three-phase algorithm, and\ncomputational experience based on real-world instances are provided.\n","authors":["Sandra Gutiérrez","Andrés Miniguano-Trujillo","Diego Recalde","Luis M. Torres","Ramiro Torres"],"pdf_url":"https://arxiv.org/pdf/1912.07356v5.pdf","comment":"28 pages, 5 figures, 8 tables"},{"id":"http://arxiv.org/abs/2205.08435v4","updated":"2025-01-09T18:56:11Z","published":"2022-05-17T15:25:23Z","title":"Cyber Risk Assessment for Capital Management","summary":" This paper introduces a two-pillar cyber risk management framework to address\nthe pervasive challenges in managing cyber risk. The first pillar, cyber risk\nassessment, combines insurance frequency-severity models with cybersecurity\ncascade models to capture the unique nature of cyber risk. The second pillar,\ncyber capital management, facilitates informed allocation of capital for a\nbalanced cyber risk management strategy, including cybersecurity investments,\ninsurance coverage, and reserves. A case study, based on historical cyber\nincident data and realistic assumptions, demonstrates the necessity of\ncomprehensive cost-benefit analysis for budget-constrained companies with\ncompeting objectives in cyber risk management. In addition, sensitivity\nanalysis highlights the dependence of the optimal strategy on factors such as\nthe price of cybersecurity controls and their effectiveness. 
The framework's\nimplementation across a diverse range of companies yields general insights on\ncyber risk management.\n","authors":["Wing Fung Chong","Runhuan Feng","Hins Hu","Linfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2205.08435v4.pdf","comment":"This paper was first presented on July 5, 2021, at the 24th\n International Congress on Insurance: Mathematics and Economics"},{"id":"http://arxiv.org/abs/2501.06266v1","updated":"2025-01-09T18:25:41Z","published":"2025-01-09T18:25:41Z","title":"Linear Algebraic Truncation Algorithm with A Posteriori Error Bounds for\n Computing Markov Chain Equilibrium Gradients","summary":" The numerical computation of equilibrium reward gradients for Markov chains\nappears in many applications for example within the policy improvement step\narising in connection with average reward stochastic dynamic programming. When\nthe state space is large or infinite, one will typically need to truncate the\nstate space in order to arrive at a numerically tractable formulation. In this\npaper, we derive the first computable a posteriori error bounds for equilibrium\nreward gradients that account for the error induced by the truncation. Our\napproach uses regeneration to express equilibrium quantities in terms of the\nexpectations of cumulative rewards over regenerative cycles. Lyapunov functions\nare then used to bound the contributions to these cumulative rewards and their\ngradients from path excursions that take the chain outside the truncation set.\nOur numerical results indicate that our approach can provide highly accurate\nbounds with truncation sets of moderate size. We further extend our approach to\nMarkov jump processes.\n","authors":["Saied Mahdian","Peter W. 
Glynn"],"pdf_url":"https://arxiv.org/pdf/2501.06266v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09173v1","updated":"2025-01-09T11:06:35Z","published":"2025-01-09T11:06:35Z","title":"Formalising the intentional stance 2: a coinductive approach","summary":" Given a stochastic process with inputs and outputs, how might its behaviour\nbe related to pursuit of a goal? We model this using 'transducers', objects\nthat capture only the external behaviour of a system and not its internal\nstate. A companion paper summarises our results for cognitive scientists; the\ncurrent paper gives formal definitions and proofs.\n To formalise the concept of a system that behaves as if it were pursuing a\ngoal, we consider what happens when a transducer (a 'policy') is coupled to\nanother transducer that comes equipped with a success condition (a\n'teleo-environment'). An optimal policy is identified with a transducer that\nbehaves as if it were perfectly rational in the pursuit of a goal; our\nframework also allows us to model constrained rationality.\n Optimal policies obey a version of Bellman's principle: a policy that's\noptimal in one time step will again be optimal in the next time step, but with\nrespect to a different teleo-environment (obtained from the original one by a\nmodified version of Bayesian filtering). This property sometimes also applies\nto the bounded-rational case; we give a sufficient condition.\n A policy is deterministic if and only if there exists a teleo-environment for\nwhich it is uniquely optimal among the set of all policies; we relate this to\nclassical representation theorems from decision theory. This result need not\nhold in the bounded-rational case; we give an example related to the\nabsent-minded driver problem. 
The formalism is defined using coinduction,\nfollowing the style proposed by Czajka.\n","authors":["Simon McGregor"," timorl","Nathaniel Virgo"],"pdf_url":"https://arxiv.org/pdf/2501.09173v1.pdf","comment":"This is the companion paper to \"Formalising the intentional stance 1:\n attributing goals and beliefs to stochastic processes\" (uploaded as version 2\n of arXiv:2405.16490). The other paper is an overview aimed at cognitive\n scientists while this paper gives full mathematical details. 50 pages, no\n figures"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2501.05452v1","updated":"2025-01-09T18:59:58Z","published":"2025-01-09T18:59:58Z","title":"ReFocus: Visual Editing as a Chain of Thought for Structured Image\n Understanding","summary":" Structured image understanding, such as interpreting tables and charts,\nrequires strategically refocusing across various structures and texts within an\nimage, forming a reasoning sequence to arrive at the final answer. However,\ncurrent multimodal large language models (LLMs) lack this multihop selective\nattention capability. In this work, we introduce ReFocus, a simple yet\neffective framework that equips multimodal LLMs with the ability to generate\n\"visual thoughts\" by performing visual editing on the input image through code,\nshifting and refining their visual focuses. Specifically, ReFocus enables\nmultimodal LLMs to generate Python codes to call tools and modify the input\nimage, sequentially drawing boxes, highlighting sections, and masking out\nareas, thereby enhancing the visual reasoning process. We experiment upon a\nwide range of structured image understanding tasks involving tables and charts.\nReFocus largely improves performance on all tasks over GPT-4o without visual\nediting, yielding an average gain of 11.0% on table tasks and 6.8% on chart\ntasks. 
We present an in-depth analysis of the effects of different visual\nedits, and reasons why ReFocus can improve the performance without introducing\nadditional information. Further, we collect a 14k training set using ReFocus,\nand prove that such visual chain-of-thought with intermediate information\noffers a better supervision than standard VQA data, reaching a 8.0% average\ngain over the same model trained with QA pairs and 2.6% over CoT.\n","authors":["Xingyu Fu","Minqian Liu","Zhengyuan Yang","John Corring","Yijuan Lu","Jianwei Yang","Dan Roth","Dinei Florencio","Cha Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.05452v1.pdf","comment":"Project link: https://zeyofu.github.io/ReFocus/"},{"id":"http://arxiv.org/abs/2501.05453v1","updated":"2025-01-09T18:59:58Z","published":"2025-01-09T18:59:58Z","title":"An Empirical Study of Autoregressive Pre-training from Videos","summary":" We empirically study autoregressive pre-training from videos. To perform our\nstudy, we construct a series of autoregressive video models, called Toto. We\ntreat videos as sequences of visual tokens and train transformer models to\nautoregressively predict future tokens. Our models are pre-trained on a diverse\ndataset of videos and images comprising over 1 trillion visual tokens. We\nexplore different architectural, training, and inference design choices. We\nevaluate the learned visual representations on a range of downstream tasks\nincluding image recognition, video classification, object tracking, and\nrobotics. Our results demonstrate that, despite minimal inductive biases,\nautoregressive pre-training leads to competitive performance across all\nbenchmarks. 
Finally, we find that scaling our video models results in similar\nscaling curves to those seen in language models, albeit with a different rate.\nMore details at https://brjathu.github.io/toto/\n","authors":["Jathushan Rajasegaran","Ilija Radosavovic","Rahul Ravishankar","Yossi Gandelsman","Christoph Feichtenhofer","Jitendra Malik"],"pdf_url":"https://arxiv.org/pdf/2501.05453v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05450v1","updated":"2025-01-09T18:59:56Z","published":"2025-01-09T18:59:56Z","title":"Decentralized Diffusion Models","summary":" Large-scale AI model training divides work across thousands of GPUs, then\nsynchronizes gradients across them at each step. This incurs a significant\nnetwork burden that only centralized, monolithic clusters can support, driving\nup infrastructure costs and straining power systems. We propose Decentralized\nDiffusion Models, a scalable framework for distributing diffusion model\ntraining across independent clusters or datacenters by eliminating the\ndependence on a centralized, high-bandwidth networking fabric. Our method\ntrains a set of expert diffusion models over partitions of the dataset, each in\nfull isolation from one another. At inference time, the experts ensemble\nthrough a lightweight router. We show that the ensemble collectively optimizes\nthe same objective as a single model trained over the whole dataset. This means\nwe can divide the training burden among a number of \"compute islands,\" lowering\ninfrastructure costs and improving resilience to localized GPU failures.\nDecentralized diffusion models empower researchers to take advantage of\nsmaller, more cost-effective and more readily available compute like on-demand\nGPU nodes rather than central integrated systems. We conduct extensive\nexperiments on ImageNet and LAION Aesthetics, showing that decentralized\ndiffusion models FLOP-for-FLOP outperform standard diffusion models. 
We finally\nscale our approach to 24 billion parameters, demonstrating that high-quality\ndiffusion models can now be trained with just eight individual GPU nodes in\nless than a week.\n","authors":["David McAllister","Matthew Tancik","Jiaming Song","Angjoo Kanazawa"],"pdf_url":"https://arxiv.org/pdf/2501.05450v1.pdf","comment":"Project webpage: https://decentralizeddiffusion.github.io/"},{"id":"http://arxiv.org/abs/2501.05449v1","updated":"2025-01-09T18:59:35Z","published":"2025-01-09T18:59:35Z","title":"Explainable AI-Enhanced Deep Learning for Pumpkin Leaf Disease\n Detection: A Comparative Analysis of CNN Architectures","summary":" Pumpkin leaf diseases are significant threats to agricultural productivity,\nrequiring a timely and precise diagnosis for effective management. Traditional\nidentification methods are laborious and susceptible to human error,\nemphasizing the necessity for automated solutions. This study employs on the\n\"Pumpkin Leaf Disease Dataset\", that comprises of 2000 high-resolution images\nseparated into five categories. Downy mildew, powdery mildew, mosaic disease,\nbacterial leaf spot, and healthy leaves. The dataset was rigorously assembled\nfrom several agricultural fields to ensure a strong representation for model\ntraining. We explored many proficient deep learning architectures, including\nDenseNet201, DenseNet121, DenseNet169, Xception, ResNet50, ResNet101 and\nInceptionResNetV2, and observed that ResNet50 performed most effectively, with\nan accuracy of 90.5% and comparable precision, recall, and F1-Score. We used\nExplainable AI (XAI) approaches like Grad-CAM, Grad-CAM++, Score-CAM, and\nLayer-CAM to provide meaningful representations of model decision-making\nprocesses, which improved understanding and trust in automated disease\ndiagnostics. These findings demonstrate ResNet50's potential to revolutionize\npumpkin leaf disease detection, allowing for earlier and more accurate\ntreatments.\n","authors":["Md. 
Arafat Alam Khandaker","Ziyan Shirin Raha","Shifat Islam","Tashreef Muhammad"],"pdf_url":"https://arxiv.org/pdf/2501.05449v1.pdf","comment":"Accepted in 2024 27th International Conference on Computer and\n Information Technology (ICCIT)"},{"id":"http://arxiv.org/abs/2501.05446v1","updated":"2025-01-09T18:58:30Z","published":"2025-01-09T18:58:30Z","title":"Relative Pose Estimation through Affine Corrections of Monocular Depth\n Priors","summary":" Monocular depth estimation (MDE) models have undergone significant\nadvancements over recent years. Many MDE models aim to predict affine-invariant\nrelative depth from monocular images, while recent developments in large-scale\ntraining and vision foundation models enable reasonable estimation of metric\n(absolute) depth. However, effectively leveraging these predictions for\ngeometric vision tasks, in particular relative pose estimation, remains\nrelatively under explored. While depths provide rich constraints for cross-view\nimage alignment, the intrinsic noise and ambiguity from the monocular depth\npriors present practical challenges to improving upon classic keypoint-based\nsolutions. In this paper, we develop three solvers for relative pose estimation\nthat explicitly account for independent affine (scale and shift) ambiguities,\ncovering both calibrated and uncalibrated conditions. We further propose a\nhybrid estimation pipeline that combines our proposed solvers with classic\npoint-based solvers and epipolar constraints. We find that the affine\ncorrection modeling is beneficial to not only the relative depth priors but\nalso, surprisingly, the ``metric\" ones. Results across multiple datasets\ndemonstrate large improvements of our approach over classic keypoint-based\nbaselines and PnP-based solutions, under both calibrated and uncalibrated\nsetups. 
We also show that our method improves consistently with different\nfeature matchers and MDE models, and can further benefit from very recent\nadvances on both modules. Code is available at\nhttps://github.com/MarkYu98/madpose.\n","authors":["Yifan Yu","Shaohui Liu","Rémi Pautrat","Marc Pollefeys","Viktor Larsson"],"pdf_url":"https://arxiv.org/pdf/2501.05446v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05445v1","updated":"2025-01-09T18:56:05Z","published":"2025-01-09T18:56:05Z","title":"Consistent Flow Distillation for Text-to-3D Generation","summary":" Score Distillation Sampling (SDS) has made significant strides in distilling\nimage-generative models for 3D generation. However, its\nmaximum-likelihood-seeking behavior often leads to degraded visual quality and\ndiversity, limiting its effectiveness in 3D applications. In this work, we\npropose Consistent Flow Distillation (CFD), which addresses these limitations.\nWe begin by leveraging the gradient of the diffusion ODE or SDE sampling\nprocess to guide the 3D generation. From the gradient-based sampling\nperspective, we find that the consistency of 2D image flows across different\nviewpoints is important for high-quality 3D generation. To achieve this, we\nintroduce multi-view consistent Gaussian noise on the 3D object, which can be\nrendered from various viewpoints to compute the flow gradient. Our experiments\ndemonstrate that CFD, through consistent flows, significantly outperforms\nprevious methods in text-to-3D generation.\n","authors":["Runjie Yan","Yinbo Chen","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2501.05445v1.pdf","comment":"Project page: https://runjie-yan.github.io/cfd/"},{"id":"http://arxiv.org/abs/2501.05444v1","updated":"2025-01-09T18:55:52Z","published":"2025-01-09T18:55:52Z","title":"Can MLLMs Reason in Multimodality? 
EMMA: An Enhanced MultiModal\n ReAsoning Benchmark","summary":" The ability to organically reason over and with both text and images is a\npillar of human intelligence, yet the ability of Multimodal Large Language\nModels (MLLMs) to perform such multimodal reasoning remains under-explored.\nExisting benchmarks often emphasize text-dominant reasoning or rely on shallow\nvisual cues, failing to adequately assess integrated visual and textual\nreasoning. We introduce EMMA (Enhanced MultiModal reAsoning), a benchmark\ntargeting organic multimodal reasoning across mathematics, physics, chemistry,\nand coding. EMMA tasks demand advanced cross-modal reasoning that cannot be\naddressed by reasoning independently in each modality, offering an enhanced\ntest suite for MLLMs' reasoning capabilities. Our evaluation of\nstate-of-the-art MLLMs on EMMA reveals significant limitations in handling\ncomplex multimodal and multi-step reasoning tasks, even with advanced\ntechniques like Chain-of-Thought prompting and test-time compute scaling\nunderperforming. These findings underscore the need for improved multimodal\narchitectures and training paradigms to close the gap between human and model\nreasoning in multimodality.\n","authors":["Yunzhuo Hao","Jiawei Gu","Huichen Will Wang","Linjie Li","Zhengyuan Yang","Lijuan Wang","Yu Cheng"],"pdf_url":"https://arxiv.org/pdf/2501.05444v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05442v1","updated":"2025-01-09T18:55:15Z","published":"2025-01-09T18:55:15Z","title":"Progressive Growing of Video Tokenizers for Highly Compressed Latent\n Spaces","summary":" Video tokenizers are essential for latent video diffusion models, converting\nraw video data into spatiotemporally compressed latent spaces for efficient\ntraining. However, extending state-of-the-art video tokenizers to achieve a\ntemporal compression ratio beyond 4x without increasing channel capacity poses\nsignificant challenges. 
In this work, we propose an alternative approach to\nenhance temporal compression. We find that the reconstruction quality of\ntemporally subsampled videos from a low-compression encoder surpasses that of\nhigh-compression encoders applied to original videos. This indicates that\nhigh-compression models can leverage representations from lower-compression\nmodels. Building on this insight, we develop a bootstrapped\nhigh-temporal-compression model that progressively trains high-compression\nblocks atop well-trained lower-compression models. Our method includes a\ncross-level feature-mixing module to retain information from the pretrained\nlow-compression model and guide higher-compression blocks to capture the\nremaining details from the full video sequence. Evaluation of video benchmarks\nshows that our method significantly improves reconstruction quality while\nincreasing temporal compression compared to direct extensions of existing video\ntokenizers. Furthermore, the resulting compact latent space effectively trains\na video diffusion model for high-quality video generation with a reduced token\nbudget.\n","authors":["Aniruddha Mahapatra","Long Mai","Yitian Zhang","David Bourgin","Feng Liu"],"pdf_url":"https://arxiv.org/pdf/2501.05442v1.pdf","comment":"Project website:\n https://progressive-video-tokenizer.github.io/Pro-MAG/"},{"id":"http://arxiv.org/abs/2501.05441v1","updated":"2025-01-09T18:53:06Z","published":"2025-01-09T18:53:06Z","title":"The GAN is dead; long live the GAN! A Modern GAN Baseline","summary":" There is a widely-spread claim that GANs are difficult to train, and GAN\narchitectures in the literature are littered with empirical tricks. We provide\nevidence against this claim and build a modern GAN baseline in a more\nprincipled manner. First, we derive a well-behaved regularized relativistic GAN\nloss that addresses issues of mode dropping and non-convergence that were\npreviously tackled via a bag of ad-hoc tricks. 
We analyze our loss\nmathematically and prove that it admits local convergence guarantees, unlike\nmost existing relativistic losses. Second, our new loss allows us to discard\nall ad-hoc tricks and replace outdated backbones used in common GANs with\nmodern architectures. Using StyleGAN2 as an example, we present a roadmap of\nsimplification and modernization that results in a new minimalist baseline --\nR3GAN. Despite being simple, our approach surpasses StyleGAN2 on FFHQ,\nImageNet, CIFAR, and Stacked MNIST datasets, and compares favorably against\nstate-of-the-art GANs and diffusion models.\n","authors":["Yiwen Huang","Aaron Gokaslan","Volodymyr Kuleshov","James Tompkin"],"pdf_url":"https://arxiv.org/pdf/2501.05441v1.pdf","comment":"Accepted to NeurIPS 2024. Code available at\n https://github.com/brownvc/R3GAN/"},{"id":"http://arxiv.org/abs/2501.05436v1","updated":"2025-01-09T18:48:55Z","published":"2025-01-09T18:48:55Z","title":"$DPF^*$: improved Depth Potential Function for scale-invariant sulcal\n depth estimation","summary":" The shape of human brain is complex and highly variable, with interactions\nbetween brain size, cortical folding, and age well-documented in the\nliterature. However, few studies have explored how global brain size influences\ngeometric features of the cortical surface derived from anatomical MRI. In this\nwork, we focus on sulcal depth, an imaging phenotype that has gained\nsignificant attention in both basic research and clinical applications. 
We make\nkey contributions to the field by: 1) providing the first quantitative analysis\nof how brain size affects sulcal depth measurements; 2) introducing a novel,\nscale-invariant method for sulcal depth estimation based on an original\nformalization of the problem; 3) presenting a validation framework and sharing\nour code and benchmark data with the community; and 4) demonstrating the\nbiological relevance of our new sulcal depth measure using a large sample of\n1,987 subjects spanning the developmental period from 26 weeks post-conception\nto adulthood.\n","authors":["Maxime Dieudonné","Guillaume Auzias","Julien Lefèvre"],"pdf_url":"https://arxiv.org/pdf/2501.05436v1.pdf","comment":"GA and JL contributed equally to this work"},{"id":"http://arxiv.org/abs/2412.06927v2","updated":"2025-01-09T18:44:39Z","published":"2024-12-09T19:12:17Z","title":"Gradient-based facial encoding for key generation to encrypt and decrypt\n multimedia data","summary":" Security systems relying on passwords are vulnerable to being forgotten,\nguessed, or breached. Likewise, biometric systems that operate independently\nare at risk of template spoofing and replay incidents. This paper introduces a\nbiocryptosystem utilizing face recognition techniques to address these issues,\nallowing for the encryption and decryption of various file types through the\nAdvanced Encryption Standard (AES). The proposed system creates a distinct\n32-bit encryption key derived from facial features identified by Histogram of\nOriented Gradients (HOG) and categorized using Support Vector Machines (SVM).\nHOG efficiently identifies edge-aligned facial features, even in dim lighting,\nensuring that reliable biometric keys can be generated. This key is then used\nwith AES to encrypt and decrypt a variety of data formats, such as text, audio,\nand video files. This encryption key, derived from an individual's distinctive\nfacial traits, is exceedingly challenging for adversaries to reproduce or\nguess. 
The security and performance of the system have been validated through\nexperiments using several metrics, including correlation analysis, Shannon\nentropy, normalized Hamming distance, and the avalanche effect on 25 different\nfile types. Potential uses for the proposed system include secure file sharing,\nonline transactions, and data archiving, making it a strong and trustworthy\napproach to safeguarding sensitive information by integrating the uniqueness of\nfacial biometrics with the established security of AES encryption.\n","authors":["Ankit Kumar Patel","Dewanshi Paul","Sarthak Giri","Sneha Chaudhary","Bikalpa Gautam"],"pdf_url":"https://arxiv.org/pdf/2412.06927v2.pdf","comment":"12 pages, 2 figures, This work has been submitted to the IEEE for\n possible publication"},{"id":"http://arxiv.org/abs/2410.08405v2","updated":"2025-01-09T18:43:18Z","published":"2024-10-10T22:38:26Z","title":"AgroGPT: Efficient Agricultural Vision-Language Model with Expert Tuning","summary":" Significant progress has been made in advancing large multimodal\nconversational models (LMMs), capitalizing on vast repositories of image-text\ndata available online. Despite this progress, these models often encounter\nsubstantial domain gaps, hindering their ability to engage in complex\nconversations across new domains. Recent efforts have aimed to mitigate this\nissue, albeit relying on domain-specific image-text data to curate\ninstruction-tuning data. However, many domains, such as agriculture, lack such\nvision-language data. In this work, we propose an approach to construct\ninstruction-tuning data that harnesses vision-only data for the agriculture\ndomain. We utilize diverse agricultural datasets spanning multiple domains,\ncurate class-specific information, and employ large language models (LLMs) to\nconstruct an expert-tuning set, resulting in a 70k expert-tuning dataset called\nAgroInstruct. 
Subsequently, we expert-tuned and created AgroGPT, an efficient\nLMM that can hold complex agriculture-related conversations and provide useful\ninsights. We also develop AgroEvals for evaluation and compare {AgroGPT's}\nperformance with large open and closed-source models. {AgroGPT} excels at\nidentifying fine-grained agricultural concepts, can act as an agriculture\nexpert, and provides helpful information for multimodal agriculture questions.\nThe code, datasets, and models are available at\nhttps://github.com/awaisrauf/agroGPT.\n","authors":["Muhammad Awais","Ali Husain Salem Abdulla Alharthi","Amandeep Kumar","Hisham Cholakkal","Rao Muhammad Anwer"],"pdf_url":"https://arxiv.org/pdf/2410.08405v2.pdf","comment":"Accepted at WACV, 2025"},{"id":"http://arxiv.org/abs/2501.05429v1","updated":"2025-01-09T18:42:47Z","published":"2025-01-09T18:42:47Z","title":"Flatland Vision","summary":" When is it possible to project two sets of labeled points lying in a pair of\nprojective planes to the same image on a projective line? We give a complete\nanswer to this question and describe the loci of the projection centers that\nenable a common image. In particular, we find that there exists a solution to\nthis problem if and only if these two sets are themselves images of a common\npointset in projective space.\n","authors":["Sameer Agarwal","Erin Connelly","Annalisa Crannell","Timothy Duff","Rekha R. Thomas"],"pdf_url":"https://arxiv.org/pdf/2501.05429v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05427v1","updated":"2025-01-09T18:37:35Z","published":"2025-01-09T18:37:35Z","title":"Zero-1-to-G: Taming Pretrained 2D Diffusion Model for Direct 3D\n Generation","summary":" Recent advances in 2D image generation have achieved remarkable\nquality,largely driven by the capacity of diffusion models and the availability\nof large-scale datasets. However, direct 3D generation is still constrained by\nthe scarcity and lower fidelity of 3D datasets. 
In this paper, we introduce\nZero-1-to-G, a novel approach that addresses this problem by enabling direct\nsingle-view generation on Gaussian splats using pretrained 2D diffusion models.\nOur key insight is that Gaussian splats, a 3D representation, can be decomposed\ninto multi-view images encoding different attributes. This reframes the\nchallenging task of direct 3D generation within a 2D diffusion framework,\nallowing us to leverage the rich priors of pretrained 2D diffusion models. To\nincorporate 3D awareness, we introduce cross-view and cross-attribute attention\nlayers, which capture complex correlations and enforce 3D consistency across\ngenerated splats. This makes Zero-1-to-G the first direct image-to-3D\ngenerative model to effectively utilize pretrained 2D diffusion priors,\nenabling efficient training and improved generalization to unseen objects.\nExtensive experiments on both synthetic and in-the-wild datasets demonstrate\nsuperior performance in 3D object generation, offering a new approach to\nhigh-quality 3D generation.\n","authors":["Xuyi Meng","Chen Wang","Jiahui Lei","Kostas Daniilidis","Jiatao Gu","Lingjie Liu"],"pdf_url":"https://arxiv.org/pdf/2501.05427v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05426v1","updated":"2025-01-09T18:35:43Z","published":"2025-01-09T18:35:43Z","title":"From Images to Insights: Transforming Brain Cancer Diagnosis with\n Explainable AI","summary":" Brain cancer represents a major challenge in medical diagnostics, requisite\nprecise and timely detection for effective treatment. Diagnosis initially\nrelies on the proficiency of radiologists, which can cause difficulties and\nthreats when the expertise is sparse. Despite the use of imaging resources,\nbrain cancer remains often difficult, time-consuming, and vulnerable to\nintraclass variability. 
This study conveys the Bangladesh Brain Cancer MRI\nDataset, containing 6,056 MRI images organized into three categories: Brain\nTumor, Brain Glioma, and Brain Menin. The dataset was collected from several\nhospitals in Bangladesh, providing a diverse and realistic sample for research.\nWe implemented advanced deep learning models, and DenseNet169 achieved\nexceptional results, with accuracy, precision, recall, and F1-Score all\nreaching 0.9983. In addition, Explainable AI (XAI) methods including GradCAM,\nGradCAM++, ScoreCAM, and LayerCAM were employed to provide visual\nrepresentations of the decision-making processes of the models. In the context\nof brain cancer, these techniques highlight DenseNet169's potential to enhance\ndiagnostic accuracy while simultaneously offering transparency, facilitating\nearly diagnosis and better patient outcomes.\n","authors":["Md. Arafat Alam Khandaker","Ziyan Shirin Raha","Salehin Bin Iqbal","M. F. Mridha","Jungpil Shin"],"pdf_url":"https://arxiv.org/pdf/2501.05426v1.pdf","comment":"Accepted in 2024 27th International Conference on Computer and\n Information Technology (ICCIT)"},{"id":"http://arxiv.org/abs/2501.05413v1","updated":"2025-01-09T18:13:57Z","published":"2025-01-09T18:13:57Z","title":"Seeing Sound: Assembling Sounds from Visuals for Audio-to-Image\n Generation","summary":" Training audio-to-image generative models requires an abundance of diverse\naudio-visual pairs that are semantically aligned. Such data is almost always\ncurated from in-the-wild videos, given the cross-modal semantic correspondence\nthat is inherent to them. In this work, we hypothesize that insisting on the\nabsolute need for ground truth audio-visual correspondence, is not only\nunnecessary, but also leads to severe restrictions in scale, quality, and\ndiversity of the data, ultimately impairing its use in the modern generative\nmodels. 
That is, we propose a scalable image sonification framework where\ninstances from a variety of high-quality yet disjoint uni-modal origins can be\nartificially paired through a retrieval process that is empowered by reasoning\ncapabilities of modern vision-language models. To demonstrate the efficacy of\nthis approach, we use our sonified images to train an audio-to-image generative\nmodel that performs competitively against state-of-the-art. Finally, through a\nseries of ablation studies, we exhibit several intriguing auditory capabilities\nlike semantic mixing and interpolation, loudness calibration and acoustic space\nmodeling through reverberation that our model has implicitly developed to guide\nthe image generation process.\n","authors":["Darius Petermann","Mahdi M. Kalayeh"],"pdf_url":"https://arxiv.org/pdf/2501.05413v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05409v1","updated":"2025-01-09T18:06:45Z","published":"2025-01-09T18:06:45Z","title":"A Novel Pathology Foundation Model by Mayo Clinic, Charité, and\n Aignostics","summary":" Recent advances in digital pathology have demonstrated the effectiveness of\nfoundation models across diverse applications. In this report, we present a\nnovel vision foundation model based on the RudolfV approach. Our model was\ntrained on a dataset comprising 1.2 million histopathology whole slide images,\ncollected from two medical institutions: Mayo Clinic and Charit\\'e -\nUniverst\\\"atsmedizin Berlin. 
Comprehensive evaluations show that our model\nachieves state-of-the-art performance across twenty-one public benchmark\ndatasets, even though it is neither the largest model by parameter count nor by\ntraining dataset size.\n","authors":["Maximilian Alber","Stephan Tietz","Jonas Dippel","Timo Milbich","Timothée Lesort","Panos Korfiatis","Moritz Krügener","Beatriz Perez Cancer","Neelay Shah","Alexander Möllers","Philipp Seegerer","Alexandra Carpen-Amarie","Kai Standvoss","Gabriel Dernbach","Edwin de Jong","Simon Schallenberg","Andreas Kunft","Helmut Hoffer von Ankershoffen","Gavin Schaeferle","Patrick Duffy","Matt Redlon","Philipp Jurmeister","David Horst","Lukas Ruff","Klaus-Robert Müller","Frederick Klauschen","Andrew Norgan"],"pdf_url":"https://arxiv.org/pdf/2501.05409v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01971v2","updated":"2025-01-09T17:57:53Z","published":"2024-09-03T15:15:49Z","title":"Snapshot: Towards Application-centered Models for Pedestrian Trajectory\n Prediction in Urban Traffic Environments","summary":" This paper explores pedestrian trajectory prediction in urban traffic while\nfocusing on both model accuracy and real-world applicability. While promising\napproaches exist, they often revolve around pedestrian datasets excluding\ntraffic-related information, or resemble architectures that are either not\nreal-time capable or robust. To address these limitations, we first introduce a\ndedicated benchmark based on Argoverse 2, specifically targeting pedestrians in\ntraffic environments. Following this, we present Snapshot, a modular,\nfeed-forward neural network that outperforms the current state of the art,\nreducing the Average Displacement Error (ADE) by 8.8% while utilizing\nsignificantly less information. Despite its agent-centric encoding scheme,\nSnapshot demonstrates scalability, real-time performance, and robustness to\nvarying motion histories. 
Moreover, by integrating Snapshot into a modular\nautonomous driving software stack, we showcase its real-world applicability.\n","authors":["Nico Uhlemann","Yipeng Zhou","Tobias Simeon Mohr","Markus Lienkamp"],"pdf_url":"https://arxiv.org/pdf/2409.01971v2.pdf","comment":"8 Pages, 9 Figures"},{"id":"http://arxiv.org/abs/2501.05399v1","updated":"2025-01-09T17:47:57Z","published":"2025-01-09T17:47:57Z","title":"Performance of YOLOv7 in Kitchen Safety While Handling Knife","summary":" Safe knife practices in the kitchen significantly reduce the risk of cuts,\ninjuries, and serious accidents during food preparation. Using YOLOv7, an\nadvanced object detection model, this study focuses on identifying safety risks\nduring knife handling, particularly improper finger placement and blade contact\nwith the hand. The model's performance was evaluated using metrics such as\nprecision, recall, mAP50, and mAP50-95. The results demonstrate that YOLOv7\nachieved its best performance at epoch 31, with a mAP50-95 score of 0.7879,\nprecision of 0.9063, and recall of 0.7503. These findings highlight YOLOv7's\npotential to accurately detect knife-related hazards, promoting the development\nof improved kitchen safety practices.\n","authors":["Athulya Sundaresan Geetha"],"pdf_url":"https://arxiv.org/pdf/2501.05399v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05379v1","updated":"2025-01-09T17:04:33Z","published":"2025-01-09T17:04:33Z","title":"Arc2Avatar: Generating Expressive 3D Avatars from a Single Image via ID\n Guidance","summary":" Inspired by the effectiveness of 3D Gaussian Splatting (3DGS) in\nreconstructing detailed 3D scenes within multi-view setups and the emergence of\nlarge 2D human foundation models, we introduce Arc2Avatar, the first SDS-based\nmethod utilizing a human face foundation model as guidance with just a single\nimage as input.
To achieve that, we extend such a model for diverse-view human\nhead generation by fine-tuning on synthetic data and modifying its\nconditioning. Our avatars maintain a dense correspondence with a human face\nmesh template, allowing blendshape-based expression generation. This is\nachieved through a modified 3DGS approach, connectivity regularizers, and a\nstrategic initialization tailored for our task. Additionally, we propose an\noptional efficient SDS-based correction step to refine the blendshape\nexpressions, enhancing realism and diversity. Experiments demonstrate that\nArc2Avatar achieves state-of-the-art realism and identity preservation,\neffectively addressing color issues by allowing the use of very low guidance,\nenabled by our strong identity prior and initialization strategy, without\ncompromising detail.\n","authors":["Dimitrios Gerogiannis","Foivos Paraperas Papantoniou","Rolandos Alexandros Potamias","Alexandros Lattas","Stefanos Zafeiriou"],"pdf_url":"https://arxiv.org/pdf/2501.05379v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05369v1","updated":"2025-01-09T16:49:04Z","published":"2025-01-09T16:49:04Z","title":"1-2-1: Renaissance of Single-Network Paradigm for Virtual Try-On","summary":" Virtual Try-On (VTON) has become a crucial tool in ecommerce, enabling the\nrealistic simulation of garments on individuals while preserving their original\nappearance and pose. Early VTON methods relied on single generative networks,\nbut challenges remain in preserving fine-grained garment details due to\nlimitations in feature extraction and fusion. To address these issues, recent\napproaches have adopted a dual-network paradigm, incorporating a complementary\n\"ReferenceNet\" to enhance garment feature extraction and fusion. While\neffective, this dual-network approach introduces significant computational\noverhead, limiting its scalability for high-resolution and long-duration\nimage/video VTON applications. 
In this paper, we challenge the dual-network\nparadigm by proposing a novel single-network VTON method that overcomes the\nlimitations of existing techniques. Our method, namely MNVTON, introduces a\nModality-specific Normalization strategy that separately processes text, image,\nand video inputs, enabling them to share the same attention layers in a VTON\nnetwork. Extensive experimental results demonstrate the effectiveness of our\napproach, showing that it consistently achieves higher-quality, more detailed\nresults for both image and video VTON tasks. Our results suggest that the\nsingle-network paradigm can rival the performance of dual-network approaches,\noffering a more efficient alternative for high-quality, scalable VTON\napplications.\n","authors":["Shuliang Ning","Yipeng Qin","Xiaoguang Han"],"pdf_url":"https://arxiv.org/pdf/2501.05369v1.pdf","comment":"Project page: https://ningshuliang.github.io/2023/Arxiv/index.html"},{"id":"http://arxiv.org/abs/2501.05359v1","updated":"2025-01-09T16:43:21Z","published":"2025-01-09T16:43:21Z","title":"CROPS: Model-Agnostic Training-Free Framework for Safe Image Synthesis\n with Latent Diffusion Models","summary":" With advances in diffusion models, image generation has shown significant\nperformance improvements. This raises concerns about the potential abuse of\nimage generation, such as the creation of explicit or violent images, commonly\nreferred to as Not Safe For Work (NSFW) content. To address this, the Stable\nDiffusion model includes several safety checkers to censor initial text prompts\nand final output images generated from the model. However, recent research has\nshown that these safety checkers have vulnerabilities against adversarial\nattacks, allowing them to generate NSFW images. In this paper, we find that\nthese adversarial attacks are not robust to small changes in text prompts or\ninput latents.
Based on this, we propose CROPS (Circular or RandOm Prompts for\nSafety), a model-agnostic framework that easily defends against adversarial\nattacks generating NSFW images without requiring additional training. Moreover,\nwe develop an approach that utilizes one-step diffusion models for efficient\nNSFW detection (CROPS-1), further reducing computational resources. We\ndemonstrate the superiority of our method in terms of performance and\napplicability.\n","authors":["Junha Park","Ian Ryu","Jaehui Hwang","Hyungkeun Park","Jiyoon Kim","Jong-Seok Lee"],"pdf_url":"https://arxiv.org/pdf/2501.05359v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.01428v3","updated":"2025-01-09T16:41:07Z","published":"2025-01-02T18:59:59Z","title":"GPT4Scene: Understand 3D Scenes from Videos with Vision-Language Models","summary":" In recent years, 2D Vision-Language Models (VLMs) have made significant\nstrides in image-text understanding tasks. However, their performance in 3D\nspatial comprehension, which is critical for embodied intelligence, remains\nlimited. Recent advances have leveraged 3D point clouds and multi-view images\nas inputs, yielding promising results. However, we propose exploring a purely\nvision-based solution inspired by human perception, which merely relies on\nvisual cues for 3D spatial understanding. This paper empirically investigates\nthe limitations of VLMs in 3D spatial knowledge, revealing that their primary\nshortcoming lies in the lack of global-local correspondence between the scene\nand individual frames. To address this, we introduce GPT4Scene, a novel visual\nprompting paradigm in VLM training and inference that helps build the\nglobal-local relationship, significantly improving the 3D spatial understanding\nof indoor scenes. Specifically, GPT4Scene constructs a 3D Bird's Eye View (BEV)\nimage from the video and marks consistent object IDs across both frames and the\nBEV image. 
The model then takes as input the concatenated BEV image and video frames\nwith markers. In zero-shot evaluations, GPT4Scene improves performance over\nclosed-source VLMs like GPT-4o. Additionally, we prepare a processed video\ndataset consisting of 165K text annotations to fine-tune open-source VLMs,\nachieving state-of-the-art performance on all 3D understanding tasks.\nSurprisingly, after training with the GPT4Scene paradigm, VLMs consistently\nimprove during inference, even without visual prompting and the BEV image as\nexplicit correspondence. This demonstrates that the proposed paradigm helps VLMs\ndevelop an intrinsic ability to understand 3D scenes, which paves the way for a\nnoninvasive approach to extending pre-trained VLMs for 3D scene understanding.\n","authors":["Zhangyang Qi","Zhixiong Zhang","Ye Fang","Jiaqi Wang","Hengshuang Zhao"],"pdf_url":"https://arxiv.org/pdf/2501.01428v3.pdf","comment":"Project page: https://gpt4scene.github.io/"},{"id":"http://arxiv.org/abs/2501.05339v1","updated":"2025-01-09T16:10:06Z","published":"2025-01-09T16:10:06Z","title":"JAQ: Joint Efficient Architecture Design and Low-Bit Quantization with\n Hardware-Software Co-Exploration","summary":" The co-design of neural network architectures, quantization precisions, and\nhardware accelerators offers a promising approach to achieving an optimal\nbalance between performance and efficiency, particularly for model deployment\non resource-constrained edge devices. In this work, we propose the JAQ\nFramework, which jointly optimizes the three critical dimensions. However,\neffectively automating the design process across the vast search space of those\nthree dimensions poses significant challenges, especially when pursuing\nextremely low-bit quantization.
Specifically, the primary challenges include: (1)\nMemory overhead on the software side: Low-precision quantization-aware training can\nlead to significant memory usage due to storing large intermediate features and\nlatent weights for back-propagation, potentially causing memory exhaustion. (2)\nTime-consuming search on the hardware side: The discrete nature of hardware\nparameters and the complex interplay between compiler optimizations and\nindividual operators make the accelerator search time-consuming. To address\nthese issues, JAQ mitigates the memory overhead through a channel-wise sparse\nquantization (CSQ) scheme, selectively applying quantization to the most\nsensitive components of the model during optimization. Additionally, JAQ\ndesigns BatchTile, which employs a hardware generation network to encode all\npossible tiling modes, thereby speeding up the search for the optimal compiler\nmapping strategy. Extensive experiments demonstrate the effectiveness of JAQ,\nachieving approximately 7% higher Top-1 accuracy on ImageNet compared to\nprevious methods and reducing the hardware search time per iteration to 0.15\nseconds.\n","authors":["Mingzi Wang","Yuan Meng","Chen Tang","Weixiang Zhang","Yijian Qin","Yang Yao","Yingxin Li","Tongtong Feng","Xin Wang","Xun Guan","Zhi Wang","Wenwu Zhu"],"pdf_url":"https://arxiv.org/pdf/2501.05339v1.pdf","comment":"Accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2501.04561v2","updated":"2025-01-09T15:54:14Z","published":"2025-01-08T15:18:09Z","title":"OpenOmni: Large Language Models Pivot Zero-shot Omnimodal Alignment\n across Language with Real-time Self-Aware Emotional Speech Synthesis","summary":" Recent advancements in omnimodal learning have been achieved in understanding\nand generation across images, text, and speech, though mainly within\nproprietary models. Limited omnimodal datasets and the inherent challenges\nassociated with real-time emotional speech generation have hindered open-source\nprogress.
To address these issues, we propose openomni, a two-stage training\nmethod combining omnimodal alignment and speech generation to develop a\nstate-of-the-art omnimodal large language model. In the alignment phase, a\npre-trained speech model is further trained on text-image tasks to generalize\nfrom vision to speech in a (near) zero-shot manner, outperforming models\ntrained on tri-modal datasets. In the speech generation phase, a lightweight\ndecoder facilitates real-time emotional speech through training on speech tasks\nand preference learning. Experiments demonstrate that openomni consistently\nimproves across omnimodal, vision-language, and speech-language evaluations,\nenabling natural, emotion-rich dialogues and real-time emotional speech\ngeneration.\n","authors":["Run Luo","Ting-En Lin","Haonan Zhang","Yuchuan Wu","Xiong Liu","Min Yang","Yongbin Li","Longze Chen","Jiaming Li","Lei Zhang","Yangyi Chen","Hamid Alinejad-Rokny","Fei Huang"],"pdf_url":"https://arxiv.org/pdf/2501.04561v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10616v2","updated":"2025-01-09T15:45:59Z","published":"2024-11-15T22:37:56Z","title":"Voxel-Aggregated Feature Synthesis: Efficient Dense Mapping for\n Simulated 3D Reasoning","summary":" We address the issue of the exploding computational requirements of recent\nstate-of-the-art (SOTA) open-set multimodal 3D mapping (dense 3D mapping)\nalgorithms and present Voxel-Aggregated Feature Synthesis (VAFS), a novel\napproach to dense 3D mapping in simulation. Dense 3D mapping involves\nsegmenting and embedding sequential RGBD frames which are then fused into 3D.\nThis leads to redundant computation as the differences between frames are small\nbut all are individually segmented and embedded. This makes dense 3D mapping\nimpractical for research involving embodied agents in which the environment,\nand thus the mapping, must be modified with regularity.
VAFS drastically\nreduces this computation by using the segmented point cloud computed by a\nsimulator's physics engine and synthesizing views of each region. This reduces\nthe number of features to embed from the number of captured RGBD frames to the\nnumber of objects in the scene, effectively allowing a \"ground truth\" semantic\nmap to be computed an order of magnitude faster than traditional methods. We\ntest the resulting representation by assessing the IoU scores of semantic\nqueries for different objects in the simulated scene, and find that VAFS\nexceeds the accuracy and speed of prior dense 3D mapping techniques.\n","authors":["Owen Burns","Rizwan Qureshi"],"pdf_url":"https://arxiv.org/pdf/2411.10616v2.pdf","comment":"6 pages, 2 figures, CVPR 2025"},{"id":"http://arxiv.org/abs/2302.08878v2","updated":"2025-01-09T15:35:59Z","published":"2023-02-17T13:50:53Z","title":"Less is More: The Influence of Pruning on the Explainability of CNNs","summary":" Modern, state-of-the-art Convolutional Neural Networks (CNNs) in computer\nvision have millions of parameters. Thus, explaining the complex decisions of\nsuch networks to humans is challenging. A technical approach to reduce CNN\ncomplexity is network pruning, where less important parameters are deleted. The\nwork presented in this paper investigates whether this technical complexity\nreduction also helps with perceived explainability. To do so, we conducted a\npre-study and two human-grounded experiments, assessing the effects of\ndifferent pruning ratios on CNN explainability. Overall, we evaluated four\ndifferent compression rates (i.e., CPR 2, 4, 8, and 32) with 37 500 tasks on\nMechanical Turk. Results indicate that lower compression rates have a positive\ninfluence on explainability, while higher compression rates show negative\neffects. 
Furthermore, we were able to identify sweet spots that increase both\nthe perceived explainability and the model's performance.\n","authors":["David Weber","Florian Merkle","Pascal Schöttle","Stephan Schlögl"],"pdf_url":"https://arxiv.org/pdf/2302.08878v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03145v2","updated":"2025-01-09T15:31:29Z","published":"2025-01-06T17:12:19Z","title":"Geometry Restoration and Dewarping of Camera-Captured Document Images","summary":" This research focuses on developing a method for restoring the topology of\ndigital images of paper documents captured by a camera, using algorithms for\ndetection, segmentation, geometry restoration, and dewarping. Our methodology\nemploys deep learning (DL) for document outline detection, followed by computer\nvision (CV) to create a topological 2D grid using cubic polynomial\ninterpolation and correct nonlinear distortions by remapping the image. Using\nclassical CV methods makes the document topology restoration process more\nefficient and faster, as it requires significantly fewer computational\nresources and memory. We developed a new pipeline for automatic document\ndewarping and reconstruction, along with a framework and annotated dataset to\ndemonstrate its efficiency. Our experiments confirm the promise of our\nmethodology and its superiority over existing benchmarks (including mobile apps\nand popular DL solutions, such as RectiNet, DocGeoNet, and DocTr++) both\nvisually and in terms of document readability via Optical Character Recognition\n(OCR) and geometry restoration metrics. This paves the way for creating\nhigh-quality digital copies of paper documents and enhancing the efficiency of\nOCR systems. 
Project page: https://github.com/HorizonParadox/DRCCBI\n","authors":["Valery Istomin","Oleg Pereziabov","Ilya Afanasyev"],"pdf_url":"https://arxiv.org/pdf/2501.03145v2.pdf","comment":"28 pages, 16 figures"},{"id":"http://arxiv.org/abs/2501.04586v2","updated":"2025-01-09T15:27:58Z","published":"2025-01-08T16:06:21Z","title":"Identity-Preserving Video Dubbing Using Motion Warping","summary":" Video dubbing aims to synthesize realistic, lip-synced videos from a\nreference video and a driving audio signal. Although existing methods can\naccurately generate mouth shapes driven by audio, they often fail to preserve\nidentity-specific features, largely because they do not effectively capture the\nnuanced interplay between audio cues and the visual attributes of the reference\nidentity. As a result, the generated outputs frequently lack fidelity in\nreproducing the unique textural and structural details of the reference\nidentity. To address these limitations, we propose IPTalker, a novel and robust\nframework for video dubbing that achieves seamless alignment between driving\naudio and reference identity while ensuring both lip-sync accuracy and\nhigh-fidelity identity preservation. At the core of IPTalker is a\ntransformer-based alignment mechanism designed to dynamically capture and model\nthe correspondence between audio features and reference images, thereby\nenabling precise, identity-aware audio-visual integration. Building on this\nalignment, a motion warping strategy further refines the results by spatially\ndeforming reference images to match the target audio-driven configuration. A\ndedicated refinement process then mitigates occlusion artifacts and enhances\nthe preservation of fine-grained textures, such as mouth details and skin\nfeatures.
Extensive qualitative and quantitative evaluations demonstrate that\nIPTalker consistently outperforms existing approaches in terms of realism, lip\nsynchronization, and identity retention, establishing a new state of the art\nfor high-quality, identity-consistent video dubbing.\n","authors":["Runzhen Liu","Qinjie Lin","Yunfei Liu","Lijian Lin","Ye Zhu","Yu Li","Chuhua Xian","Fa-Ting Hong"],"pdf_url":"https://arxiv.org/pdf/2501.04586v2.pdf","comment":"v2, Under Review"},{"id":"http://arxiv.org/abs/2501.05281v1","updated":"2025-01-09T14:43:36Z","published":"2025-01-09T14:43:36Z","title":"Comparison Study: Glacier Calving Front Delineation in Synthetic\n Aperture Radar Images With Deep Learning","summary":" Calving front position variation of marine-terminating glaciers is an\nindicator of ice mass loss and a crucial parameter in numerical glacier models.\nDeep Learning (DL) systems can automatically extract this position from\nSynthetic Aperture Radar (SAR) imagery, enabling continuous, weather- and\nillumination-independent, large-scale monitoring. This study presents the first\ncomparison of DL systems on a common calving front benchmark dataset. A\nmulti-annotator study with ten annotators is performed to contrast the\nbest-performing DL system against human performance. The best DL model's\noutputs deviate 221 m on average, while the average deviation of the human\nannotators is 38 m. This significant difference shows that current DL systems\ndo not yet match human performance and that further research is needed to\nenable fully automated monitoring of glacier calving fronts. 
The study of Vision Transformers and foundation models, as well as\nstrategies for incorporating and processing additional information, is\nidentified as an avenue for future research.\n","authors":["Nora Gourmelon","Konrad Heidler","Erik Loebel","Daniel Cheng","Julian Klink","Anda Dong","Fei Wu","Noah Maul","Moritz Koch","Marcel Dreier","Dakota Pyles","Thorsten Seehaus","Matthias Braun","Andreas Maier","Vincent Christlein"],"pdf_url":"https://arxiv.org/pdf/2501.05281v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03616v2","updated":"2025-01-09T14:33:09Z","published":"2025-01-07T08:32:48Z","title":"BTMTrack: Robust RGB-T Tracking via Dual-template Bridging and\n Temporal-Modal Candidate Elimination","summary":" RGB-T tracking leverages the complementary strengths of RGB and thermal\ninfrared (TIR) modalities to address challenging scenarios such as low\nillumination and adverse weather. However, existing methods often fail to\neffectively integrate temporal information and perform efficient cross-modal\ninteractions, which constrain their adaptability to dynamic targets. In this\npaper, we propose BTMTrack, a novel framework for RGB-T tracking. The core of\nour approach lies in the dual-template backbone network and the Temporal-Modal\nCandidate Elimination (TMCE) strategy. The dual-template backbone effectively\nintegrates temporal information, while the TMCE strategy focuses the model on\ntarget-relevant tokens by evaluating temporal and modal correlations, reducing\ncomputational overhead and avoiding irrelevant background noise. Building upon\nthis foundation, we propose the Temporal Dual Template Bridging (TDTB) module,\nwhich facilitates precise cross-modal fusion through dynamically filtered\ntokens. This approach further strengthens the interaction between templates and\nthe search region. Extensive experiments conducted on three benchmark datasets\ndemonstrate the effectiveness of BTMTrack.
Our method achieves state-of-the-art\nperformance, with a 72.3% precision rate on the LasHeR test set and competitive\nresults on the RGBT210 and RGBT234 datasets.\n","authors":["Zhongxuan Zhang","Bi Zeng","Xinyu Ni","Yimin Du"],"pdf_url":"https://arxiv.org/pdf/2501.03616v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05272v1","updated":"2025-01-09T14:31:54Z","published":"2025-01-09T14:31:54Z","title":"Solving the Catastrophic Forgetting Problem in Generalized Category\n Discovery","summary":" Generalized Category Discovery (GCD) aims to identify a mix of known and\nnovel categories within unlabeled data sets, providing a more realistic setting\nfor image recognition. Essentially, GCD needs to remember existing patterns\nthoroughly to recognize novel categories. The recent state-of-the-art method SimGCD\ntransfers the knowledge from known-class data to the learning of novel classes\nthrough debiased learning. However, some patterns are catastrophically forgotten\nduring adaptation, which leads to poor performance in novel category\nclassification. To address this issue, we propose a novel learning approach,\nLegoGCD, which is seamlessly integrated into previous methods to enhance the\ndiscrimination of novel classes while maintaining performance on previously\nencountered known classes. Specifically, we design two types of techniques\ntermed Local Entropy Regularization (LER) and Dual-views Kullback Leibler\ndivergence constraint (DKL). The LER optimizes the distribution of potential\nknown class samples in unlabeled data, thus ensuring the preservation of\nknowledge related to known categories while learning novel classes. Meanwhile,\nDKL introduces Kullback Leibler divergence to encourage the model to produce\nsimilar prediction distributions for two view samples from the same image. In\nthis way, it successfully avoids mismatched predictions and generates more\nreliable potential known class samples simultaneously.
Extensive experiments\nvalidate that the proposed LegoGCD effectively addresses the known category\nforgetting issue across all datasets, e.g., delivering a 7.74% and 2.51% accuracy\nboost on known and novel classes in CUB, respectively. Our code is available\nat: https://github.com/Cliffia123/LegoGCD.\n","authors":["Xinzi Cao","Xiawu Zheng","Guanhong Wang","Weijiang Yu","Yunhang Shen","Ke Li","Yutong Lu","Yonghong Tian"],"pdf_url":"https://arxiv.org/pdf/2501.05272v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2501.05269v1","updated":"2025-01-09T14:26:50Z","published":"2025-01-09T14:26:50Z","title":"CellViT++: Energy-Efficient and Adaptive Cell Segmentation and\n Classification Using Foundation Models","summary":" Digital Pathology is a cornerstone in the diagnosis and treatment of\ndiseases. A key task in this field is the identification and segmentation of\ncells in hematoxylin and eosin-stained images. Existing methods for cell\nsegmentation often require extensive annotated datasets for training and are\nlimited to a predefined cell classification scheme. To overcome these\nlimitations, we propose $\\text{CellViT}^{{\\scriptscriptstyle ++}}$, a framework\nfor generalized cell segmentation in digital pathology.\n$\\text{CellViT}^{{\\scriptscriptstyle ++}}$ utilizes Vision Transformers with\nfoundation models as encoders to compute deep cell features and segmentation\nmasks simultaneously. To adapt to unseen cell types, we rely on a\ncomputationally efficient approach. It requires minimal data for training and\nleads to a drastically reduced carbon footprint. We demonstrate excellent\nperformance on seven different datasets, covering a broad spectrum of cell\ntypes, organs, and clinical settings.
The framework achieves remarkable\nzero-shot segmentation and data-efficient cell-type classification.\nFurthermore, we show that $\\text{CellViT}^{{\\scriptscriptstyle ++}}$ can\nleverage immunofluorescence stainings to generate training datasets without the\nneed for pathologist annotations. The automated dataset generation approach\nsurpasses the performance of networks trained on manually labeled data,\ndemonstrating its effectiveness in creating high-quality training datasets\nwithout expert annotations. To advance digital pathology,\n$\\text{CellViT}^{{\\scriptscriptstyle ++}}$ is available as an open-source\nframework featuring a user-friendly, web-based interface for visualization and\nannotation. The code is available under\nhttps://github.com/TIO-IKIM/CellViT-plus-plus.\n","authors":["Fabian Hörst","Moritz Rempe","Helmut Becker","Lukas Heine","Julius Keyl","Jens Kleesiek"],"pdf_url":"https://arxiv.org/pdf/2501.05269v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05265v1","updated":"2025-01-09T14:19:46Z","published":"2025-01-09T14:19:46Z","title":"Patch-GAN Transfer Learning with Reconstructive Models for Cloud Removal","summary":" Cloud removal plays a crucial role in enhancing remote sensing image\nanalysis, yet accurately reconstructing cloud-obscured regions remains a\nsignificant challenge. Recent advancements in generative models have made the\ngeneration of realistic images increasingly accessible, offering new\nopportunities for this task. Given the conceptual alignment between image\ngeneration and cloud removal tasks, generative models present a promising\napproach for addressing cloud removal in remote sensing. In this work, we\npropose a deep transfer learning approach built on a generative adversarial\nnetwork (GAN) framework to explore the potential of the novel masked\nautoencoder (MAE) image reconstruction model in cloud removal. 
Due to the\ncomplexity of remote sensing imagery, we further propose using a patch-wise\ndiscriminator to determine whether each patch of the image is real or not. The\nproposed reconstructive transfer learning approach demonstrates significant\nimprovements in cloud removal performance compared to other GAN-based methods.\nAdditionally, whilst direct comparisons with some of the state-of-the-art cloud\nremoval techniques are limited due to unclear details regarding their\ntrain/test data splits, the proposed model achieves competitive results based\non available benchmarks.\n","authors":["Wanli Ma","Oktay Karakus","Paul L. Rosin"],"pdf_url":"https://arxiv.org/pdf/2501.05265v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05264v1","updated":"2025-01-09T14:19:33Z","published":"2025-01-09T14:19:33Z","title":"Towards Balanced Continual Multi-Modal Learning in Human Pose Estimation","summary":" 3D human pose estimation (3D HPE) has emerged as a prominent research topic,\nparticularly in the realm of RGB-based methods. However, RGB images are\nsusceptible to limitations such as sensitivity to lighting conditions and\npotential user discomfort. Consequently, multi-modal sensing, which leverages\nnon-intrusive sensors, is gaining increasing attention. Nevertheless,\nmulti-modal 3D HPE still faces challenges, including modality imbalance and the\nimperative for continual learning. In this work, we introduce a novel balanced\ncontinual multi-modal learning method for 3D HPE, which harnesses the power of\nRGB, LiDAR, mmWave, and WiFi. Specifically, we propose a Shapley value-based\ncontribution algorithm to quantify the contribution of each modality and\nidentify modality imbalance. To address this imbalance, we employ a re-learning\nstrategy. Furthermore, recognizing that raw data is prone to noise\ncontamination, we develop a novel denoising continual learning approach. 
This\napproach incorporates a noise identification and separation module to mitigate\nthe adverse effects of noise and collaborates with the balanced learning\nstrategy to enhance optimization. Additionally, an adaptive EWC mechanism is\nemployed to alleviate catastrophic forgetting. We conduct extensive experiments\non the widely-adopted multi-modal dataset, MM-Fi, which demonstrate the\nsuperiority of our approach in boosting 3D pose estimation and mitigating\ncatastrophic forgetting in complex scenarios. We will release our code.\n","authors":["Jiaxuan Peng","Mengshi Qi","Dong Zhao","Huadong Ma"],"pdf_url":"https://arxiv.org/pdf/2501.05264v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16623v2","updated":"2025-01-09T13:59:21Z","published":"2023-11-28T09:24:42Z","title":"Visual Semantic Navigation with Real Robots","summary":" Visual Semantic Navigation (VSN) is the ability of a robot to learn visual\nsemantic information for navigating in unseen environments. These VSN models\nare typically tested in the virtual environments where they are trained,\nmainly using reinforcement learning based approaches. Therefore, we do not yet\nhave an in-depth analysis of how these models would behave in the real world.\nIn this work, we propose a new solution to integrate VSN models into real\nrobots, so that we have true embodied agents. We also release a novel ROS-based\nframework for VSN, ROS4VSN, so that any VSN model can be easily deployed in any\nROS-compatible robot and tested in a real setting. Our experiments with two\ndifferent robots, where we have embedded two state-of-the-art VSN agents,\nconfirm that there is a noticeable performance difference between these VSN\nsolutions when tested in real-world and simulation environments. We hope that\nthis research provides a foundation for addressing this\nconsequential issue, with the ultimate aim of advancing the performance and\nefficiency of embodied agents within authentic real-world scenarios.
Code to\nreproduce all our experiments can be found at\nhttps://github.com/gramuah/ros4vsn.\n","authors":["Carlos Gutiérrez-Álvarez","Pablo Ríos-Navarro","Rafael Flor-Rodríguez","Francisco Javier Acevedo-Rodríguez","Roberto J. López-Sastre"],"pdf_url":"https://arxiv.org/pdf/2311.16623v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05246v1","updated":"2025-01-09T13:54:59Z","published":"2025-01-09T13:54:59Z","title":"Domain-Incremental Semantic Segmentation for Autonomous Driving under\n Adverse Driving Conditions","summary":" Semantic segmentation for autonomous driving is an even more challenging task\nwhen faced with adverse driving conditions. Standard models trained on data\nrecorded under ideal conditions show deteriorated performance in unfavorable\nweather or illumination conditions. Fine-tuning on the new task or condition\nwould lead to overwriting the previously learned information, resulting in\ncatastrophic forgetting. Adapting to the new conditions through traditional\ndomain adaptation methods improves the performance on the target domain at the\nexpense of the source domain. Addressing these issues, we propose an\narchitecture-based domain-incremental learning approach called Progressive\nSemantic Segmentation (PSS). PSS is a task-agnostic, dynamically growing\ncollection of domain-specific segmentation models. The task of inferring the\ndomain and subsequently selecting the appropriate module for segmentation is\ncarried out using a collection of convolutional autoencoders. We extensively\nevaluate our proposed approach using several datasets at varying levels of\ngranularity in the categorization of adverse driving conditions.
Furthermore,\nwe demonstrate the generalization of the proposed approach to similar and\nunseen domains.\n","authors":["Shishir Muralidhara","René Schuster","Didier Stricker"],"pdf_url":"https://arxiv.org/pdf/2501.05246v1.pdf","comment":"Accepted at ICPRAM 2025"},{"id":"http://arxiv.org/abs/2501.05244v1","updated":"2025-01-09T13:52:30Z","published":"2025-01-09T13:52:30Z","title":"Optimized Sampling for Non-Line-of-Sight Imaging Using Modified Fast\n Fourier Transforms","summary":" Non-line-of-Sight (NLOS) imaging systems collect light at a diffuse relay\nsurface and input this measurement into computational algorithms that output a\n3D volumetric reconstruction. These algorithms utilize the Fast Fourier\nTransform (FFT) to accelerate the reconstruction process but require both input\nand output to be sampled spatially with uniform grids. However, the geometry of\nNLOS imaging inherently results in non-uniform sampling on the relay surface\nwhen using multi-pixel detector arrays, even though such arrays significantly\nreduce acquisition times. Furthermore, using these arrays increases the data\nrate required for sensor readout, posing challenges for real-world deployment.\nIn this work, we utilize the phasor field framework to demonstrate that\nexisting NLOS imaging setups typically oversample the relay surface spatially,\nexplaining why the measurement can be compressed without significantly\nsacrificing reconstruction quality. This enables us to utilize the Non-Uniform\nFast Fourier Transform (NUFFT) to reconstruct from sparse measurements acquired\nfrom irregularly sampled relay surfaces of arbitrary shapes. Furthermore, we\nutilize the NUFFT to reconstruct at arbitrary locations in the hidden volume,\nensuring flexible sampling schemes for both the input and output. Finally, we\nutilize the Scaled Fast Fourier Transform (SFFT) to reconstruct larger volumes\nwithout increasing the number of samples stored in memory. 
All algorithms\nintroduced in this paper preserve the computational complexity of FFT-based\nmethods, ensuring scalability for practical NLOS imaging applications.\n","authors":["Talha Sultan","Alex Bocchieri","Chaoying Gu","Xiaochun Liu","Pavel Polynkin","Andreas Velten"],"pdf_url":"https://arxiv.org/pdf/2501.05244v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05242v1","updated":"2025-01-09T13:50:26Z","published":"2025-01-09T13:50:26Z","title":"Scaffold-SLAM: Structured 3D Gaussians for Simultaneous Localization and\n Photorealistic Mapping","summary":" 3D Gaussian Splatting (3DGS) has recently revolutionized novel view synthesis\nin Simultaneous Localization and Mapping (SLAM). However, existing SLAM\nmethods utilizing 3DGS have failed to provide high-quality novel view rendering\nfor monocular, stereo, and RGB-D cameras simultaneously. Notably, some methods\nperform well for RGB-D cameras but suffer significant degradation in rendering\nquality for monocular cameras. In this paper, we present Scaffold-SLAM, which\ndelivers simultaneous localization and high-quality photorealistic mapping\nacross monocular, stereo, and RGB-D cameras. We introduce two key innovations\nto achieve this state-of-the-art visual quality. First, we propose\nAppearance-from-Motion embedding, enabling 3D Gaussians to better model image\nappearance variations across different camera poses. Second, we introduce a\nfrequency regularization pyramid to guide the distribution of Gaussians,\nallowing the model to effectively capture finer details in the scene. 
Extensive\nexperiments on monocular, stereo, and RGB-D datasets demonstrate that\nScaffold-SLAM significantly outperforms state-of-the-art methods in\nphotorealistic mapping quality, e.g., PSNR is 16.76% higher on the TUM RGB-D\ndataset for monocular cameras.\n","authors":["Wen Tianci","Liu Zhiang","Lu Biao","Fang Yongchun"],"pdf_url":"https://arxiv.org/pdf/2501.05242v1.pdf","comment":"12 pages, 6 figures"},{"id":"http://arxiv.org/abs/2501.05241v1","updated":"2025-01-09T13:46:46Z","published":"2025-01-09T13:46:46Z","title":"Contrast-Free Myocardial Scar Segmentation in Cine MRI using Motion and\n Texture Fusion","summary":" Late gadolinium enhancement MRI (LGE MRI) is the gold standard for the\ndetection of myocardial scars after myocardial infarction (MI). LGE MRI\nrequires the injection of a contrast agent, which carries potential side\neffects and increases scanning time and patient discomfort. To address these\nissues, we propose a novel framework that combines cardiac motion observed in\ncine MRI with image texture information to segment the myocardium and scar\ntissue in the left ventricle. Cardiac motion tracking can be formulated as a\nfull cardiac image cycle registration problem, which can be solved via deep\nneural networks. Experimental results show that the proposed method can\nachieve scar segmentation based on non-contrasted cine images with comparable\naccuracy to LGE MRI. This demonstrates its potential as an alternative to\ncontrast-enhanced techniques for scar detection.\n","authors":["Guang Yang","Jingkun Chen","Xicheng Sheng","Shan Yang","Xiahai Zhuang","Betty Raman","Lei Li","Vicente Grau"],"pdf_url":"https://arxiv.org/pdf/2501.05241v1.pdf","comment":"5 pages, 2 figures, 2 tables"},{"id":"http://arxiv.org/abs/2501.05239v1","updated":"2025-01-09T13:44:42Z","published":"2025-01-09T13:44:42Z","title":"Is Your Autonomous Vehicle Safe? 
Understanding the Threat of\n Electromagnetic Signal Injection Attacks on Traffic Scene Perception","summary":" Autonomous vehicles rely on camera-based perception systems to comprehend\ntheir driving environment and make crucial decisions, thereby enabling vehicles\nto steer safely. However, a significant threat known as Electromagnetic Signal\nInjection Attacks (ESIA) can distort the images captured by these cameras,\nleading to incorrect AI decisions and potentially compromising the safety of\nautonomous vehicles. Despite the serious implications of ESIA, there is limited\nunderstanding of its impact on the robustness of AI models across diverse and\ncomplex driving scenarios. To address this gap, our research analyzes the\nperformance of different models under ESIA, revealing their vulnerabilities to\nthe attacks. Moreover, due to the challenges in obtaining real-world attack\ndata, we develop a novel ESIA simulation method and generate a simulated attack\ndataset for different driving scenarios. Our research provides a comprehensive\nsimulation and evaluation framework, aiming to enhance the development of more\nrobust AI models and secure intelligent systems, ultimately contributing to the\nadvancement of safer and more reliable technology across various fields.\n","authors":["Wenhao Liao","Sineng Yan","Youqian Zhang","Xinwei Zhai","Yuanyuan Wang","Eugene Yujun Fu"],"pdf_url":"https://arxiv.org/pdf/2501.05239v1.pdf","comment":"To appear in AAAI 2025"},{"id":"http://arxiv.org/abs/2501.05238v1","updated":"2025-01-09T13:44:15Z","published":"2025-01-09T13:44:15Z","title":"FOCUS: Towards Universal Foreground Segmentation","summary":" Foreground segmentation is a fundamental task in computer vision,\nencompassing various subdivision tasks. Previous research has typically\ndesigned task-specific architectures for each task, leading to a lack of\nunification. 
Moreover, they primarily focus on recognizing foreground objects\nwithout effectively distinguishing them from the background. In this paper, we\nemphasize the importance of the background and its relationship with the\nforeground. We introduce FOCUS, the Foreground ObjeCts Universal Segmentation\nframework that can handle multiple foreground tasks. We develop a multi-scale\nsemantic network using the edge information of objects to enhance image\nfeatures. To achieve boundary-aware segmentation, we propose a novel\ndistillation method, integrating the contrastive learning strategy to refine\nthe prediction mask in multi-modal feature space. We conduct extensive\nexperiments on a total of 13 datasets across 5 tasks, and the results\ndemonstrate that FOCUS consistently outperforms the state-of-the-art\ntask-specific models on most metrics.\n","authors":["Zuyao You","Lingyu Kong","Lingchen Meng","Zuxuan Wu"],"pdf_url":"https://arxiv.org/pdf/2501.05238v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05236v1","updated":"2025-01-09T13:43:01Z","published":"2025-01-09T13:43:01Z","title":"Automated external cervical resorption segmentation in cone-beam CT\n using local texture features","summary":" External cervical resorption (ECR) is a resorptive process affecting teeth.\nWhile in some patients, active resorption ceases and gets replaced by osseous\ntissue, in other cases, the resorption progresses and ultimately results in\ntooth loss. For proper ECR assessment, cone-beam computed tomography (CBCT) is\nthe recommended imaging modality, enabling a 3-D characterization of these\nlesions. While it is possible to manually identify and measure ECR resorption\nin CBCT scans, this process can be time intensive and highly subject to human\nerror. Therefore, there is an urgent need to develop an automated method to\nidentify and quantify the severity of ECR resorption using CBCT. 
Here, we\npresent a method for ECR lesion segmentation that is based on automatic, binary\nclassification of locally extracted voxel-wise texture features. We evaluate\nour method on 6 longitudinal CBCT datasets and show that certain\ntexture features can be used to accurately detect subtle CBCT signal changes\ndue to ECR. We also present preliminary analyses clustering texture features\nwithin a lesion to stratify the defects and identify patterns indicative of\ncalcification. These methods are important steps in developing prognostic\nbiomarkers to predict whether ECR will continue to progress or cease,\nultimately informing treatment decisions.\n","authors":["Sadhana Ravikumar","Asma A. Khan","Matthew C. Davis","Beatriz Paniagua"],"pdf_url":"https://arxiv.org/pdf/2501.05236v1.pdf","comment":"4 pages, 3 figures, 1 table"},{"id":"http://arxiv.org/abs/2501.05228v1","updated":"2025-01-09T13:36:37Z","published":"2025-01-09T13:36:37Z","title":"Harnessing Large Language and Vision-Language Models for Robust\n Out-of-Distribution Detection","summary":" Out-of-distribution (OOD) detection has seen significant advancements with\nzero-shot approaches by leveraging the powerful Vision-Language Models (VLMs)\nsuch as CLIP. However, prior research has predominantly focused on\nenhancing Far-OOD performance, while potentially compromising Near-OOD\nefficacy, as observed from our pilot study. To address this issue, we propose a\nnovel strategy to enhance zero-shot OOD detection performance for both Far-OOD\nand Near-OOD scenarios by innovatively harnessing Large Language Models (LLMs)\nand VLMs. Our approach first exploits an LLM to generate superclasses of the ID\nlabels and their corresponding background descriptions, followed by feature\nextraction using CLIP. We then isolate the core semantic features for ID data\nby subtracting background features from the superclass features. 
The refined\nrepresentation facilitates the selection of more appropriate negative labels\nfor OOD data from a comprehensive candidate label set of WordNet, thereby\nenhancing the performance of zero-shot OOD detection in both scenarios.\nFurthermore, we introduce novel few-shot prompt tuning and visual prompt tuning\nto adapt the proposed framework to better align with the target distribution.\nExperimental results demonstrate that the proposed approach consistently\noutperforms current state-of-the-art methods across multiple benchmarks, with\nan improvement of up to 2.9% in AUROC and a reduction of up to 12.6% in FPR95.\nAdditionally, our method exhibits superior robustness against covariate shift\nacross different domains, further highlighting its effectiveness in real-world\nscenarios.\n","authors":["Pei-Kang Lee","Jun-Cheng Chen","Ja-Ling Wu"],"pdf_url":"https://arxiv.org/pdf/2501.05228v1.pdf","comment":"9 pages, 4 figures"},{"id":"http://arxiv.org/abs/2501.05226v1","updated":"2025-01-09T13:29:54Z","published":"2025-01-09T13:29:54Z","title":"Light Transport-aware Diffusion Posterior Sampling for Single-View\n Reconstruction of 3D Volumes","summary":" We introduce a single-view reconstruction technique of volumetric fields in\nwhich multiple light scattering effects are omnipresent, such as in clouds. We\nmodel the unknown distribution of volumetric fields using an unconditional\ndiffusion model trained on a novel benchmark dataset comprising 1,000\nsynthetically simulated volumetric density fields. The neural diffusion model\nis trained on the latent codes of a novel, diffusion-friendly, monoplanar\nrepresentation. The generative model is used to incorporate a tailored\nparametric diffusion posterior sampling technique into different reconstruction\ntasks. A physically-based differentiable volume renderer is employed to provide\ngradients with respect to light transport in the latent space. 
This stands in\ncontrast to classic NeRF approaches and makes the reconstructions better\naligned with observed data. Through various experiments, we demonstrate\nsingle-view reconstruction of volumetric clouds at a previously unattainable\nquality.\n","authors":["Ludwic Leonard","Nils Thuerey","Ruediger Westermann"],"pdf_url":"https://arxiv.org/pdf/2501.05226v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07899v3","updated":"2025-01-09T13:01:55Z","published":"2024-11-12T16:12:51Z","title":"Rendering-Oriented 3D Point Cloud Attribute Compression using Sparse\n Tensor-based Transformer","summary":" The evolution of 3D visualization techniques has fundamentally transformed\nhow we interact with digital content. At the forefront of this change is point\ncloud technology, offering an immersive experience that surpasses traditional\n2D representations. However, the massive data size of point clouds presents\nsignificant challenges in data compression. Current methods for lossy point\ncloud attribute compression (PCAC) generally focus on reconstructing the\noriginal point clouds with minimal error. However, for point cloud\nvisualization scenarios, the reconstructed point clouds with distortion still\nneed to undergo a complex rendering process, which affects the final\nuser-perceived quality. In this paper, we propose an end-to-end deep learning\nframework that seamlessly integrates PCAC with differentiable rendering,\ndenoted as rendering-oriented PCAC (RO-PCAC), directly targeting the quality of\nrendered multiview images for viewing. In a differentiable manner, the impact\nof the rendering process on the reconstructed point clouds is taken into\naccount. Moreover, we characterize point clouds as sparse tensors and propose a\nsparse tensor-based transformer, called SP-Trans. 
By aligning with the local\ndensity of the point cloud and utilizing an enhanced local attention mechanism,\nSP-Trans captures the intricate relationships within the point cloud, further\nimproving feature analysis and synthesis within the framework. Extensive\nexperiments demonstrate that the proposed RO-PCAC achieves state-of-the-art\ncompression performance, compared to existing reconstruction-oriented methods,\nincluding traditional, learning-based, and hybrid methods.\n","authors":["Xiao Huo","Junhui Hou","Shuai Wan","Fuzheng Yang"],"pdf_url":"https://arxiv.org/pdf/2411.07899v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05209v1","updated":"2025-01-09T13:00:01Z","published":"2025-01-09T13:00:01Z","title":"MHAFF: Multi-Head Attention Feature Fusion of CNN and Transformer for\n Cattle Identification","summary":" Convolutional Neural Networks (CNNs) have drawn researchers' attention to\nidentifying cattle using muzzle images. However, CNNs often fail to capture\nlong-range dependencies within the complex patterns of the muzzle.\nTransformers handle these challenges. This inspired us to fuse the strengths of\nCNNs and transformers in muzzle-based cattle identification. Addition and\nconcatenation have been the most commonly used techniques for feature fusion.\nHowever, addition fails to preserve discriminative information, while\nconcatenation results in an increase in dimensionality. Both methods are simple\noperations and cannot discover the relationships or interactions between fusing\nfeatures. To overcome these issues, this research introduces a novel approach called Multi-Head\nAttention Feature Fusion (MHAFF) for the first time in cattle identification.\nMHAFF captures relations between the different types of fusing features while\npreserving their originality. 
The experiments show that MHAFF outperformed\naddition and concatenation techniques and the existing cattle identification\nmethods in accuracy on two publicly available cattle datasets. MHAFF\ndemonstrates excellent performance and quickly converges to achieve optimum\naccuracy of 99.88% and 99.52% in two cattle datasets simultaneously.\n","authors":["Rabin Dulal","Lihong Zheng","Muhammad Ashad Kabir"],"pdf_url":"https://arxiv.org/pdf/2501.05209v1.pdf","comment":"30 pages"},{"id":"http://arxiv.org/abs/2501.05205v1","updated":"2025-01-09T12:55:55Z","published":"2025-01-09T12:55:55Z","title":"Discovering Hidden Visual Concepts Beyond Linguistic Input in Infant\n Learning","summary":" Infants develop complex visual understanding rapidly, even preceding the\nacquisition of linguistic inputs. As computer vision seeks to replicate the\nhuman vision system, understanding infant visual development may offer valuable\ninsights. In this paper, we present an interdisciplinary study exploring this\nquestion: can a computational model that imitates the infant learning process\ndevelop broader visual concepts that extend beyond the vocabulary it has heard,\nsimilar to how infants naturally learn? To investigate this, we analyze a\nrecently published model in Science by Vong et al., which is trained on\nlongitudinal, egocentric images of a single child paired with transcribed\nparental speech. We introduce a training-free framework that can discover\nvisual concept neurons hidden in the model's internal representations. Our\nfindings show that these neurons can classify objects outside the model's original\nvocabulary. Furthermore, we compare the visual representations in infant-like\nmodels with those in modern computer vision models, such as CLIP or ImageNet\npre-trained models, highlighting key similarities and differences. 
Ultimately,\nour work bridges cognitive science and computer vision by analyzing the\ninternal representations of a computational model trained on an infant's visual\nand linguistic inputs.\n","authors":["Xueyi Ke","Satoshi Tsutsui","Yayun Zhang","Bihan Wen"],"pdf_url":"https://arxiv.org/pdf/2501.05205v1.pdf","comment":"12 pages, 11 figures"},{"id":"http://arxiv.org/abs/2408.11559v4","updated":"2025-01-09T12:45:39Z","published":"2024-08-21T12:13:18Z","title":"Semi-supervised 3D Semantic Scene Completion with 2D Vision Foundation\n Model Guidance","summary":" Accurate prediction of 3D semantic occupancy from 2D visual images is vital\nin enabling autonomous agents to comprehend their surroundings for planning and\nnavigation. State-of-the-art methods typically employ fully supervised\napproaches, necessitating a huge labeled dataset acquired through expensive\nLiDAR sensors and meticulous voxel-wise labeling by human annotators. The\nresource-intensive nature of this annotating process significantly hampers the\napplication and scalability of these methods. We introduce a novel\nsemi-supervised framework to alleviate the dependency on densely annotated\ndata. Our approach leverages 2D foundation models to generate essential 3D\nscene geometric and semantic cues, facilitating a more efficient training\nprocess. Our framework exhibits notable properties: (1) Generalizability,\napplicable to various 3D semantic scene completion approaches, including 2D-3D\nlifting and 3D-2D transformer methods. (2) Effectiveness, as demonstrated\nthrough experiments on SemanticKITTI and NYUv2, wherein our method achieves up\nto 85% of the fully-supervised performance using only 10% labeled data. 
This\napproach not only reduces the cost and labor associated with data annotation\nbut also demonstrates the potential for broader adoption in camera-based\nsystems for 3D semantic occupancy prediction.\n","authors":["Duc-Hai Pham","Duc-Dung Nguyen","Anh Pham","Tuan Ho","Phong Nguyen","Khoi Nguyen","Rang Nguyen"],"pdf_url":"https://arxiv.org/pdf/2408.11559v4.pdf","comment":"Accepted at AAAI2025. Project Page:\n https://vinairesearch.github.io/SemiSSC"},{"id":"http://arxiv.org/abs/2412.05557v2","updated":"2025-01-09T12:38:33Z","published":"2024-12-07T06:42:35Z","title":"CoE: Deep Coupled Embedding for Non-Rigid Point Cloud Correspondences","summary":" The interest in matching non-rigidly deformed shapes represented as raw point\nclouds is rising due to the proliferation of low-cost 3D sensors. Yet, the task\nis challenging since point clouds are irregular and there is a lack of\nintrinsic shape information. We propose to tackle these challenges by learning\na new shape representation -- a per-point high dimensional embedding, in an\nembedding space where semantically similar points share similar embeddings. The\nlearned embedding has multiple beneficial properties: it is aware of the\nunderlying shape geometry and is robust to shape deformations and various shape\nartefacts, such as noise and partiality. Consequently, this embedding can be\ndirectly employed to retrieve high-quality dense correspondences through a\nsimple nearest neighbor search in the embedding space. 
Extensive experiments\ndemonstrate new state-of-the-art results and robustness in numerous challenging\nnon-rigid shape matching benchmarks and show its great potential in other shape\nanalysis tasks, such as segmentation.\n","authors":["Huajian Zeng","Maolin Gao","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2412.05557v2.pdf","comment":"16 pages, 17 figures"},{"id":"http://arxiv.org/abs/2501.05195v1","updated":"2025-01-09T12:33:46Z","published":"2025-01-09T12:33:46Z","title":"HipyrNet: Hypernet-Guided Feature Pyramid network for mixed-exposure\n correction","summary":" Recent advancements in image translation for enhancing mixed-exposure images\nhave demonstrated the transformative potential of deep learning algorithms.\nHowever, addressing extreme exposure variations in images remains a significant\nchallenge due to the inherent complexity and contrast inconsistencies across\nregions. Current methods often struggle to adapt effectively to these\nvariations, resulting in suboptimal performance. In this work, we propose\nHipyrNet, a novel approach that integrates a HyperNetwork within a Laplacian\nPyramid-based framework to tackle the challenges of mixed-exposure image\nenhancement. The inclusion of a HyperNetwork allows the model to adapt to these\nexposure variations. A HyperNetwork dynamically generates weights for another\nnetwork, allowing dynamic changes during deployment. In our model, the\nHyperNetwork predicts optimal kernels for Feature Pyramid\ndecomposition, which enables a tailored and adaptive decomposition process for\neach input image. Our enhanced translational network incorporates multiscale\ndecomposition and reconstruction, leveraging dynamic kernel prediction to\ncapture and manipulate features across varying scales. 
Extensive experiments\ndemonstrate that HipyrNet outperforms existing methods, particularly in\nscenarios with extreme exposure variations, achieving superior results in both\nqualitative and quantitative evaluations. Our approach sets a new benchmark for\nmixed-exposure image enhancement, paving the way for future research in\nadaptive image translation.\n","authors":["Shaurya Singh Rathore","Aravind Shenoy","Krish Didwania","Aditya Kasliwal","Ujjwal Verma"],"pdf_url":"https://arxiv.org/pdf/2501.05195v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.17251v5","updated":"2025-01-09T12:28:55Z","published":"2024-11-26T09:29:27Z","title":"DGNN-YOLO: Interpretable Dynamic Graph Neural Networks with YOLO11 for\n Detecting and Tracking Small Occluded Objects in Urban Traffic","summary":" The detection and tracking of small, occluded objects such as pedestrians,\ncyclists, and motorbikes pose significant challenges for traffic surveillance\nsystems because of their erratic movement, frequent occlusion, and poor\nvisibility in dynamic urban environments. Traditional methods like YOLO11,\nwhile proficient in spatial feature extraction for precise detection, often\nstruggle with these small and dynamically moving objects, particularly in\nhandling real-time data updates and resource efficiency. This paper introduces\nDGNN-YOLO, a novel framework that integrates dynamic graph neural networks\n(DGNNs) with YOLO11 to address these limitations. Unlike standard GNNs, DGNNs\nare chosen for their superior ability to dynamically update graph structures in\nreal-time, which enables adaptive and robust tracking of objects in highly\nvariable urban traffic scenarios. This framework constructs and regularly\nupdates its graph representations, capturing objects as nodes and their\ninteractions as edges, thus effectively responding to rapidly changing\nconditions. 
Additionally, DGNN-YOLO incorporates Grad-CAM, Grad-CAM++, and\nEigen-CAM visualization techniques to enhance interpretability and foster\ntrust, offering insights into the model's decision-making process. Extensive\nexperiments validate the framework's performance, achieving a precision of\n0.8382, recall of 0.6875, and mAP@0.5:0.95 of 0.6476, significantly\noutperforming existing methods. This study offers a scalable and interpretable\nsolution for real-time traffic surveillance and significantly advances\nintelligent transportation systems' capabilities by addressing the critical\nchallenge of detecting and tracking small, occluded objects.\n","authors":["Shahriar Soudeep","M. F. Mridha","Md Abrar Jahin","Nilanjan Dey"],"pdf_url":"https://arxiv.org/pdf/2411.17251v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05179v1","updated":"2025-01-09T11:57:58Z","published":"2025-01-09T11:57:58Z","title":"Compression with Global Guidance: Towards Training-free High-Resolution\n MLLMs Acceleration","summary":" Multimodal large language models (MLLMs) have attracted considerable\nattention due to their exceptional performance in visual content understanding\nand reasoning. However, their inference efficiency has been a notable concern,\nas the increasing length of multimodal contexts leads to quadratic complexity.\nToken compression techniques, which reduce the number of visual tokens, have\ndemonstrated their effectiveness in reducing computational costs. Yet, these\napproaches have struggled to keep pace with the rapid advancements in MLLMs,\nespecially the AnyRes strategy in the context of high-resolution image\nunderstanding. In this paper, we propose a novel token compression method,\nGlobalCom$^2$, tailored for high-resolution MLLMs that receive both the\nthumbnail and multiple crops. 
GlobalCom$^2$ treats the tokens derived from the\nthumbnail as the ``commander'' of the entire token compression process,\ndirecting the allocation of retention ratios and the specific compression for\neach crop. In this way, redundant tokens are eliminated while important local\ndetails are adaptively preserved to the highest extent feasible. Empirical\nresults across 10 benchmarks reveal that GlobalCom$^2$ achieves an optimal\nbalance between performance and efficiency, and consistently outperforms\nstate-of-the-art token compression methods with LLaVA-NeXT-7B/13B models. Our\ncode is released at \\url{https://github.com/xuyang-liu16/GlobalCom2}.\n","authors":["Xuyang Liu","Ziming Wang","Yuhang Han","Yingyao Wang","Jiale Yuan","Jun Song","Bo Zheng","Linfeng Zhang","Siteng Huang","Honggang Chen"],"pdf_url":"https://arxiv.org/pdf/2501.05179v1.pdf","comment":"Our code is released at\n \\url{https://github.com/xuyang-liu16/GlobalCom2}"},{"id":"http://arxiv.org/abs/2501.05177v1","updated":"2025-01-09T11:52:54Z","published":"2025-01-09T11:52:54Z","title":"FaceMe: Robust Blind Face Restoration with Personal Identification","summary":" Blind face restoration is a highly ill-posed problem due to the lack of\nnecessary context. Although existing methods produce high-quality outputs, they\noften fail to faithfully preserve the individual's identity. In this paper, we\npropose a personalized face restoration method, FaceMe, based on a diffusion\nmodel. Given a single or a few reference images, we use an identity encoder to\nextract identity-related features, which serve as prompts to guide the\ndiffusion model in restoring high-quality and identity-consistent facial\nimages. By simply combining identity-related features, we effectively minimize\nthe impact of identity-irrelevant features during training and support any\nnumber of reference image inputs during inference. 
Additionally, thanks to the\nrobustness of the identity encoder, synthesized images can be used as reference\nimages during training, and identity changing during inference does not require\nfine-tuning the model. We also propose a pipeline for constructing a reference\nimage training pool that simulates the poses and expressions that may appear in\nreal-world scenarios. Experimental results demonstrate that our FaceMe can\nrestore high-quality facial images while maintaining identity consistency,\nachieving excellent performance and robustness.\n","authors":["Siyu Liu","Zheng-Peng Duan","Jia OuYang","Jiayi Fu","Hyunhee Park","Zikun Liu","Chun-Le Guo","Chongyi Li"],"pdf_url":"https://arxiv.org/pdf/2501.05177v1.pdf","comment":"To appear at AAAI 2025"},{"id":"http://arxiv.org/abs/2406.14080v3","updated":"2025-01-09T11:31:56Z","published":"2024-06-20T07:56:51Z","title":"CMTNet: Convolutional Meets Transformer Network for Hyperspectral Images\n Classification","summary":" Hyperspectral imaging (HSI) enables the detailed capture of spectral\ninformation from the Earth's surface, facilitating precise classification and\nidentification of surface crops due to its superior spectral diagnostic\ncapabilities. However, current convolutional neural networks (CNNs) focus on\nlocal features in hyperspectral data, leading to suboptimal performance when\nclassifying intricate crop types and addressing imbalanced sample\ndistributions. In contrast, the Transformer framework excels at extracting\nglobal features from hyperspectral imagery. To leverage the strengths of both\napproaches, this research introduces the Convolutional Meets Transformer Network\n(CMTNet). 
This innovative model includes a spectral-spatial feature extraction\nmodule for shallow feature capture, a dual-branch structure combining CNN and\nTransformer branches for local and global feature extraction, and a\nmulti-output constraint module that enhances classification accuracy through\nmulti-output loss calculations and cross constraints across local,\nglobal, and joint features. Extensive experiments conducted on three\ndatasets (WHU-Hi-LongKou, WHU-Hi-HanChuan, and WHU-Hi-HongHu) demonstrate that\nCMTNet significantly outperforms other state-of-the-art networks in\nclassification performance, validating its effectiveness in hyperspectral crop\nclassification.\n","authors":["Faxu Guo","Quan Feng","Sen Yang","Wanxia Yang"],"pdf_url":"https://arxiv.org/pdf/2406.14080v3.pdf","comment":"After submission, our research team underwent a significant shift in\n the project's focus and direction. As a result, the current manuscript no\n longer accurately reflects the revised scope or findings of our research. To\n prevent potential misinterpretations or misleading citations, we believe it\n is in the best interest of the academic community to withdraw this article"},{"id":"http://arxiv.org/abs/2403.14320v3","updated":"2025-01-09T10:59:37Z","published":"2024-03-21T11:41:39Z","title":"Exosense: A Vision-Based Scene Understanding System For Exoskeletons","summary":" Self-balancing exoskeletons are a key enabling technology for individuals\nwith mobility impairments. While the current challenges focus on\nhuman-compliant hardware and control, unlocking their use for daily activities\nrequires a scene perception system. In this work, we present Exosense, a\nvision-centric scene understanding system for self-balancing exoskeletons. We\nintroduce a multi-sensor visual-inertial mapping device as well as a navigation\nstack for state estimation, terrain mapping and long-term operation. 
We tested\nExosense attached to both a human leg and Wandercraft's Personal Exoskeleton in\nreal-world indoor scenarios. This enabled us to test the system during typical\nperiodic walking gaits, as well as future uses in multi-story environments. We\ndemonstrate that Exosense can achieve an odometry drift of about 4 cm per meter\ntraveled, and construct terrain maps under 1 cm average reconstruction error.\nIt can also work in a visual localization mode in a previously mapped\nenvironment, providing a step towards long-term operation of exoskeletons.\n","authors":["Jianeng Wang","Matias Mattamala","Christina Kassab","Guillaume Burger","Fabio Elnecave","Lintong Zhang","Marine Petriaux","Maurice Fallon"],"pdf_url":"https://arxiv.org/pdf/2403.14320v3.pdf","comment":"8 pages, 9 figures"},{"id":"http://arxiv.org/abs/2501.05147v1","updated":"2025-01-09T10:56:50Z","published":"2025-01-09T10:56:50Z","title":"A Systematic Literature Review on Deep Learning-based Depth Estimation\n in Computer Vision","summary":" Depth estimation (DE) provides spatial information about a scene and enables\ntasks such as 3D reconstruction, object detection, and scene understanding.\nRecently, there has been an increasing interest in using deep learning\n(DL)-based methods for DE. Traditional techniques rely on handcrafted features\nthat often struggle to generalise to diverse scenes and require extensive\nmanual tuning. However, DL models for DE can automatically extract relevant\nfeatures from input data, adapt to various scene conditions, and generalise\nwell to unseen environments. Numerous DL-based methods have been developed,\nmaking it necessary to survey and synthesize the state-of-the-art (SOTA).\nPrevious reviews on DE have mainly focused on either monocular or stereo-based\ntechniques, rather than comprehensively reviewing DE. Furthermore, to the best\nof our knowledge, there is no systematic literature review (SLR) that\ncomprehensively focuses on DE. 
Therefore, this SLR study is being conducted.\nInitially, electronic databases were searched for relevant publications,\nresulting in 1284 publications. Using defined exclusion and quality criteria,\n128 publications were shortlisted and further filtered to select 59\nhigh-quality primary studies. These studies were analysed to extract data and\nanswer defined research questions. Based on the results, DL methods were\ndeveloped for mainly three different types of DE: monocular, stereo, and\nmulti-view. 20 publicly available datasets were used to train, test, and\nevaluate DL models for DE, with KITTI, NYU Depth V2, and Make 3D being the most\nused datasets. 29 evaluation metrics were used to assess the performance of DE.\n35 base models were reported in the primary studies, and the top five most-used\nbase models were ResNet-50, ResNet-18, ResNet-101, U-Net, and VGG-16. Finally,\nthe lack of ground truth data was among the most significant challenges\nreported by primary studies.\n","authors":["Ali Rohan","Md Junayed Hasan","Andrei Petrovski"],"pdf_url":"https://arxiv.org/pdf/2501.05147v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01486v3","updated":"2025-01-09T10:56:43Z","published":"2024-06-03T16:11:39Z","title":"Differentiable Task Graph Learning: Procedural Activity Representation\n and Online Mistake Detection from Egocentric Videos","summary":" Procedural activities are sequences of key-steps aimed at achieving specific\ngoals. They are crucial to build intelligent agents able to assist users\neffectively. In this context, task graphs have emerged as a\nhuman-understandable representation of procedural activities, encoding a\npartial ordering over the key-steps. 
While previous works generally relied on\nhand-crafted procedures to extract task graphs from videos, in this paper, we\npropose an approach based on direct maximum likelihood optimization of edges'\nweights, which allows gradient-based learning of task graphs and can be\nnaturally plugged into neural network architectures. Experiments on the\nCaptainCook4D dataset demonstrate the ability of our approach to predict\naccurate task graphs from the observation of action sequences, with an\nimprovement of +16.7% over previous approaches. Owing to the differentiability\nof the proposed framework, we also introduce a feature-based approach, aiming\nto predict task graphs from key-step textual or video embeddings, for which we\nobserve emerging video understanding abilities. Task graphs learned with our\napproach are also shown to significantly enhance online mistake detection in\nprocedural egocentric videos, achieving notable gains of +19.8% and +7.5% on\nthe Assembly101-O and EPIC-Tent-O datasets. Code for replicating experiments is\navailable at https://github.com/fpv-iplab/Differentiable-Task-Graph-Learning.\n","authors":["Luigi Seminara","Giovanni Maria Farinella","Antonino Furnari"],"pdf_url":"https://arxiv.org/pdf/2406.01486v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05132v1","updated":"2025-01-09T10:34:25Z","published":"2025-01-09T10:34:25Z","title":"CorrDiff: Adaptive Delay-aware Detector with Temporal Cue Inputs for\n Real-time Object Detection","summary":" Real-time object detection takes an essential part in the decision-making\nprocess of numerous real-world applications, including collision avoidance and\npath planning in autonomous driving systems. This paper presents a novel\nreal-time streaming perception method named CorrDiff, designed to tackle the\nchallenge of delays in real-time detection systems. 
The main contribution of\nCorrDiff lies in its adaptive delay-aware detector, which is able to utilize\nruntime-estimated temporal cues to predict objects' locations for multiple\nfuture frames, and selectively produce predictions that match real-world\ntime, effectively compensating for any communication and computational delays.\nThe proposed model outperforms current state-of-the-art methods by leveraging\nmotion estimation and feature enhancement, both for 1) single-frame detection\nfor the current frame or the next frame, in terms of the metric mAP, and 2) the\nprediction for (multiple) future frame(s), in terms of the metric sAP (The sAP\nmetric evaluates object detection algorithms in streaming scenarios,\nfactoring in both latency and accuracy). It demonstrates robust performance\nacross a range of devices, from the powerful Tesla V100 to the modest RTX 2080Ti,\nachieving the highest level of perceptual accuracy on all platforms. Unlike\nmost state-of-the-art methods that struggle to complete computation within a\nsingle frame on less powerful devices, CorrDiff meets the stringent real-time\nprocessing requirements on all kinds of devices. The experimental results\nemphasize the system's adaptability and its potential to significantly improve\nthe safety and reliability of many real-world systems, such as autonomous\ndriving. 
Our code is completely open-sourced and is available at\nhttps://anonymous.4open.science/r/CorrDiff.\n","authors":["Xiang Zhang","Chenchen Fu","Yufei Cui","Lan Yi","Yuyang Sun","Weiwei Wu","Xue Liu"],"pdf_url":"https://arxiv.org/pdf/2501.05132v1.pdf","comment":"Submitted to IEEE JSAC Special Issue: Intelligent Communications for\n Real-Time Computer Vision (Comm4CV)"},{"id":"http://arxiv.org/abs/2501.05131v1","updated":"2025-01-09T10:34:00Z","published":"2025-01-09T10:34:00Z","title":"3DIS-FLUX: simple and efficient multi-instance generation with DiT\n rendering","summary":" The growing demand for controllable outputs in text-to-image generation has\ndriven significant advancements in multi-instance generation (MIG), enabling\nusers to define both instance layouts and attributes. Currently, the\nstate-of-the-art methods in MIG are primarily adapter-based. However, these\nmethods necessitate retraining a new adapter each time a more advanced model is\nreleased, resulting in significant resource consumption. A methodology named\nDepth-Driven Decoupled Instance Synthesis (3DIS) has been introduced, which\ndecouples MIG into two distinct phases: 1) depth-based scene construction and\n2) detail rendering with widely pre-trained depth control models. The 3DIS\nmethod requires adapter training solely during the scene construction phase,\nwhile enabling various models to perform training-free detail rendering.\nInitially, 3DIS focused on rendering techniques utilizing U-Net architectures\nsuch as SD1.5, SD2, and SDXL, without exploring the potential of recent\nDiT-based models like FLUX. In this paper, we present 3DIS-FLUX, an extension\nof the 3DIS framework that integrates the FLUX model for enhanced rendering\ncapabilities. Specifically, we employ the FLUX.1-Depth-dev model for depth map\ncontrolled image generation and introduce a detail renderer that manipulates\nthe Attention Mask in FLUX's Joint Attention mechanism based on layout\ninformation. 
This approach allows for the precise rendering of fine-grained\nattributes of each instance. Our experimental results indicate that 3DIS-FLUX,\nleveraging the FLUX model, outperforms the original 3DIS method, which utilized\nSD2 and SDXL, and surpasses current state-of-the-art adapter-based methods in\nterms of both performance and image quality. Project Page:\nhttps://limuloo.github.io/3DIS/.\n","authors":["Dewei Zhou","Ji Xie","Zongxin Yang","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2501.05131v1.pdf","comment":"tech report"},{"id":"http://arxiv.org/abs/2501.05122v1","updated":"2025-01-09T10:26:14Z","published":"2025-01-09T10:26:14Z","title":"Centurio: On Drivers of Multilingual Ability of Large Vision-Language\n Model","summary":" Most Large Vision-Language Models (LVLMs) to date are trained predominantly\non English data, which makes them struggle to understand non-English input and\nfail to generate output in the desired target language. Existing efforts\nmitigate these issues by adding multilingual training data, but do so in a\nlargely ad-hoc manner, lacking insight into how different training mixes tip\nthe scale for different groups of languages. In this work, we present a\ncomprehensive investigation into the training strategies for massively\nmultilingual LVLMs. First, we conduct a series of multi-stage experiments\nspanning 13 downstream vision-language tasks and 43 languages, systematically\nexamining: (1) the number of training languages that can be included without\ndegrading English performance and (2) optimal language distributions of\npre-training as well as (3) instruction-tuning data. Further, we (4)\ninvestigate how to improve multilingual text-in-image understanding, and\nintroduce a new benchmark for the task. 
Surprisingly, our analysis reveals that\none can (i) include as many as 100 training languages simultaneously (ii) with\nas little as 25-50\\% of non-English data, to greatly improve multilingual\nperformance while retaining strong English performance. We further find that\n(iii) including non-English OCR data in pre-training and instruction-tuning is\nparamount for improving multilingual text-in-image understanding. Finally, we\nput all our findings together and train Centurio, a 100-language LVLM, offering\nstate-of-the-art performance in an evaluation covering 14 tasks and 56\nlanguages.\n","authors":["Gregor Geigle","Florian Schneider","Carolin Holtermann","Chris Biemann","Radu Timofte","Anne Lauscher","Goran Glavaš"],"pdf_url":"https://arxiv.org/pdf/2501.05122v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05120v1","updated":"2025-01-09T10:22:35Z","published":"2025-01-09T10:22:35Z","title":"Improving the U-Net Configuration for Automated Delineation of Head and\n Neck Cancer on MRI","summary":" Tumor volume segmentation on MRI is a challenging and time-consuming process\nthat is performed manually in typical clinical settings. This work presents an\napproach to automated delineation of head and neck tumors on MRI scans,\ndeveloped in the context of the MICCAI Head and Neck Tumor Segmentation for\nMR-Guided Applications (HNTS-MRG) 2024 Challenge. Rather than designing a new,\ntask-specific convolutional neural network, the focus of this research was to\npropose improvements to the configuration commonly used in medical segmentation\ntasks, relying solely on the traditional U-Net architecture. The empirical\nresults presented in this article suggest the superiority of patch-wise\nnormalization used for both training and sliding window inference. They also\nindicate that the performance of segmentation models can be enhanced by\napplying a scheduled data augmentation policy during training. 
Finally, it is\nshown that a small improvement in quality can be achieved by using Gaussian\nweighting to combine predictions for individual patches during sliding window\ninference. The model with the best configuration obtained an aggregated Dice\nSimilarity Coefficient (DSCagg) of 0.749 in Task 1 and 0.710 in Task 2 on five\ncross-validation folds. The ensemble of five models (one best model per\nvalidation fold) showed consistent results on a private test set of 50 patients\nwith a DSCagg of 0.752 in Task 1 and 0.718 in Task 2 (team name:\nandrei.iantsen). The source code and model weights are freely available at\nwww.github.com/iantsen/hntsmrg.\n","authors":["Andrei Iantsen"],"pdf_url":"https://arxiv.org/pdf/2501.05120v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05108v1","updated":"2025-01-09T09:56:33Z","published":"2025-01-09T09:56:33Z","title":"Optimizing Multitask Industrial Processes with Predictive Action\n Guidance","summary":" Monitoring complex assembly processes is critical for maintaining\nproductivity and ensuring compliance with assembly standards. However,\nvariability in human actions and subjective task preferences complicate\naccurate task anticipation and guidance. To address these challenges, we\nintroduce the Multi-Modal Transformer Fusion and Recurrent Units (MMTF-RU)\nNetwork for egocentric activity anticipation, utilizing multimodal fusion to\nimprove prediction accuracy. Integrated with the Operator Action Monitoring\nUnit (OAMU), the system provides proactive operator guidance, preventing\ndeviations in the assembly process. OAMU employs two strategies: (1) Top-5\nMMTF-RU predictions, combined with a reference graph and an action dictionary,\nfor next-step recommendations; and (2) Top-1 MMTF-RU predictions, integrated\nwith a reference graph, for detecting sequence deviations and predicting\nanomaly scores via an entropy-informed confidence mechanism. 
We also introduce\nTime-Weighted Sequence Accuracy (TWSA) to evaluate operator efficiency and\nensure timely task completion. Our approach is validated on the industrial\nMeccano dataset and the large-scale EPIC-Kitchens-55 dataset, demonstrating its\neffectiveness in dynamic environments.\n","authors":["Naval Kishore Mehta"," Arvind","Shyam Sunder Prasad","Sumeet Saurav","Sanjay Singh"],"pdf_url":"https://arxiv.org/pdf/2501.05108v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05098v1","updated":"2025-01-09T09:37:27Z","published":"2025-01-09T09:37:27Z","title":"Motion-X++: A Large-Scale Multimodal 3D Whole-body Human Motion Dataset","summary":" In this paper, we introduce Motion-X++, a large-scale multimodal 3D\nexpressive whole-body human motion dataset. Existing motion datasets\npredominantly capture body-only poses, lacking facial expressions, hand\ngestures, and fine-grained pose descriptions, and are typically limited to lab\nsettings with manually labeled text descriptions, thereby restricting their\nscalability. To address this issue, we develop a scalable annotation pipeline\nthat can automatically capture 3D whole-body human motion and comprehensive\ntextual labels from RGB videos and build the Motion-X dataset comprising 81.1K\ntext-motion pairs. Furthermore, we extend Motion-X into Motion-X++ by improving\nthe annotation pipeline, introducing more data modalities, and scaling up the\ndata quantities. Motion-X++ provides 19.5M 3D whole-body pose annotations\ncovering 120.5K motion sequences from massive scenes, 80.8K RGB videos, 45.3K\naudios, 19.5M frame-level whole-body pose descriptions, and 120.5K\nsequence-level semantic labels. 
Comprehensive experiments validate the accuracy\nof our annotation pipeline and highlight Motion-X++'s significant benefits for\ngenerating expressive, precise, and natural motion with paired multimodal\nlabels supporting several downstream tasks, including text-driven whole-body\nmotion generation, audio-driven motion generation, 3D whole-body human mesh\nrecovery, and 2D whole-body keypoints estimation, etc.\n","authors":["Yuhong Zhang","Jing Lin","Ailing Zeng","Guanlin Wu","Shunlin Lu","Yurong Fu","Yuanhao Cai","Ruimao Zhang","Haoqian Wang","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.05098v1.pdf","comment":"17 pages, 14 figures, This work extends and enhances the research\n published in the NeurIPS 2023 paper, \"Motion-X: A Large-scale 3D Expressive\n Whole-body Human Motion Dataset\". arXiv admin note: substantial text overlap\n with arXiv:2307.00818"},{"id":"http://arxiv.org/abs/2501.05097v1","updated":"2025-01-09T09:25:22Z","published":"2025-01-09T09:25:22Z","title":"A 1Mb mixed-precision quantized encoder for image classification and\n patch-based compression","summary":" Even if Application-Specific Integrated Circuits (ASIC) have proven to be a\nrelevant choice for integrating inference at the edge, they are often limited\nin terms of applicability. In this paper, we demonstrate that an ASIC neural\nnetwork accelerator dedicated to image processing can be applied to multiple\ntasks of different levels: image classification and compression, while\nrequiring very limited hardware. The key component is a reconfigurable,\nmixed-precision (3b/2b/1b) encoder that takes advantage of proper weight and\nactivation quantizations combined with convolutional layer structural pruning\nto lower hardware-related constraints (memory and computing). We introduce an\nautomatic adaptation of linear symmetric quantizer scaling factors to perform\nquantized levels equalization, aiming at stabilizing quinary and ternary\nweights training. 
In addition, a proposed layer-shared Bit-Shift Normalization\nsignificantly simplifies the implementation of the hardware-expensive Batch\nNormalization. For a specific configuration in which the encoder design only\nrequires 1Mb, the classification accuracy reaches 87.5% on CIFAR-10. Besides,\nwe also show that this quantized encoder can be used to compress images\npatch-by-patch while the reconstruction can be performed remotely, by a dedicated\nfull-frame decoder. This solution typically enables an end-to-end compression\nalmost without any block artifacts, outperforming patch-based state-of-the-art\ntechniques employing a patch-constant bitrate.\n","authors":["Van Thien Nguyen","William Guicquero","Gilles Sicard"],"pdf_url":"https://arxiv.org/pdf/2501.05097v1.pdf","comment":"Published at IEEE Transactions on Circuits and Systems for Video\n Technology (TCSVT)"},{"id":"http://arxiv.org/abs/2501.05095v1","updated":"2025-01-09T09:21:09Z","published":"2025-01-09T09:21:09Z","title":"Advancing ALS Applications with Large-Scale Pre-training: Dataset\n Development and Downstream Assessment","summary":" The pre-training and fine-tuning paradigm has revolutionized satellite remote\nsensing applications. However, this approach remains largely underexplored for\nairborne laser scanning (ALS), an important technology for applications such as\nforest management and urban planning. In this study, we address this gap by\nconstructing a large-scale ALS point cloud dataset and evaluating its impact on\ndownstream applications. Our dataset comprises ALS point clouds collected\nacross the contiguous United States, provided by the United States Geological\nSurvey's 3D Elevation Program. To ensure efficient data collection while\ncapturing diverse land cover and terrain types, we introduce a geospatial\nsampling method that selects point cloud tiles based on land cover maps and\ndigital elevation models. 
As a baseline self-supervised learning model, we\nadopt BEV-MAE, a state-of-the-art masked autoencoder for 3D outdoor point\nclouds, and pre-train it on the constructed dataset. The pre-trained models are\nsubsequently fine-tuned for downstream tasks, including tree species\nclassification, terrain scene recognition, and point cloud semantic\nsegmentation. Our results show that the pre-trained models significantly\noutperform their scratch counterparts across all downstream tasks,\ndemonstrating the transferability of the representations learned from the\nproposed dataset. Furthermore, we observe that scaling the dataset using our\ngeospatial sampling method consistently enhances performance, whereas\npre-training on datasets constructed with random sampling fails to achieve\nsimilar improvements. These findings highlight the utility of the constructed\ndataset and the effectiveness of our sampling strategy in the pre-training and\nfine-tuning paradigm. The source code and pre-trained models will be made\npublicly available at \\url{https://github.com/martianxiu/ALS_pretraining}.\n","authors":["Haoyi Xiu","Xin Liu","Taehoon Kim","Kyoung-Sook Kim"],"pdf_url":"https://arxiv.org/pdf/2501.05095v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05091v1","updated":"2025-01-09T09:15:07Z","published":"2025-01-09T09:15:07Z","title":"ResPanDiff: Diffusion Model with Disentangled Modulations for Image\n Fusion","summary":" The implementation of diffusion-based pansharpening task is predominantly\nconstrained by its slow inference speed, which results from numerous sampling\nsteps. Despite the existing techniques aiming to accelerate sampling, they\noften compromise performance when fusing multi-source images. 
To ease this\nlimitation, we introduce a novel and efficient diffusion model named Diffusion\nModel for Pansharpening by Inferring Residual Inference (ResPanDiff), which\nsignificantly reduces the number of diffusion steps without sacrificing\nperformance on the pansharpening task. In ResPanDiff, we innovatively\npropose a Markov chain that transits from noisy residuals to the residuals\nbetween the LRMS and HRMS images, thereby reducing the number of sampling steps\nand enhancing performance. Additionally, we design the latent space to help the\nmodel extract more features at the encoding stage, Shallow\nCond-Injection~(SC-I) to help the model fetch cond-injected hidden features with\nhigher dimensions, and loss functions to give better guidance for the\nresidual generation task, enabling the model to achieve superior performance in\nresidual generation. Furthermore, experimental evaluations on pansharpening\ndatasets demonstrate that the proposed method achieves superior outcomes\ncompared to recent state-of-the-art~(SOTA) techniques, requiring only 15\nsampling steps, which reduces the number of steps by over $90\\%$ compared with the benchmark\ndiffusion models. Our experiments also include thorough discussions and\nablation studies to underscore the effectiveness of our approach.\n","authors":["Shiqi Cao","Liangjian Deng","Shangqi Deng"],"pdf_url":"https://arxiv.org/pdf/2501.05091v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03700v2","updated":"2025-01-09T09:12:06Z","published":"2023-12-06T18:59:19Z","title":"OneLLM: One Framework to Align All Modalities with Language","summary":" Multimodal large language models (MLLMs) have gained significant attention\ndue to their strong multimodal understanding capability. However, existing\nworks rely heavily on modality-specific encoders, which usually differ in\narchitecture and are limited to common modalities. In this paper, we present\nOneLLM, an MLLM that aligns eight modalities to language using a unified\nframework. 
We achieve this through a unified multimodal encoder and a\nprogressive multimodal alignment pipeline. In detail, we first train an image\nprojection module to connect a vision encoder with LLM. Then, we build a\nuniversal projection module (UPM) by mixing multiple image projection modules\nand dynamic routing. Finally, we progressively align more modalities to LLM\nwith the UPM. To fully leverage the potential of OneLLM in following\ninstructions, we also curated a comprehensive multimodal instruction dataset,\nincluding 2M items from image, audio, video, point cloud, depth/normal map, IMU\nand fMRI brain activity. OneLLM is evaluated on 25 diverse benchmarks,\nencompassing tasks such as multimodal captioning, question answering and\nreasoning, where it delivers excellent performance. Code, data, model and\nonline demo are available at https://github.com/csuhan/OneLLM\n","authors":["Jiaming Han","Kaixiong Gong","Yiyuan Zhang","Jiaqi Wang","Kaipeng Zhang","Dahua Lin","Yu Qiao","Peng Gao","Xiangyu Yue"],"pdf_url":"https://arxiv.org/pdf/2312.03700v2.pdf","comment":"Accepted by CVPR 2024. Code: https://github.com/csuhan/OneLLM"},{"id":"http://arxiv.org/abs/2501.05085v1","updated":"2025-01-09T09:10:17Z","published":"2025-01-09T09:10:17Z","title":"End-to-End Deep Learning for Interior Tomography with Low-Dose X-ray CT","summary":" Objective: There exist several X-ray computed tomography (CT) scanning\nstrategies to reduce a radiation dose, such as (1) sparse-view CT, (2) low-dose\nCT, and (3) region-of-interest (ROI) CT (called interior tomography). To\nfurther reduce the dose, the sparse-view and/or low-dose CT settings can be\napplied together with interior tomography. Interior tomography has various\nadvantages in terms of reducing the number of detectors and decreasing the\nX-ray radiation dose. However, a large patient or small field-of-view (FOV)\ndetector can cause truncated projections, and then the reconstructed images\nsuffer from severe cupping artifacts. 
In addition, although low-dose CT can\nreduce the radiation exposure dose, analytic reconstruction algorithms produce\nimage noise. Recently, many researchers have utilized image-domain deep\nlearning (DL) approaches to remove each artifact and demonstrated impressive\nperformances, and the theory of deep convolutional framelets supports the\nreason for the performance improvement. Approach: In this paper, we found that\nimage-domain convolutional neural networks (CNNs) have difficulty resolving\ncoupled artifacts, based on deep convolutional framelets. Significance: To\naddress the coupled problem, we decouple it into two sub-problems: (i) image\ndomain noise reduction inside the truncated projection to solve the low-dose CT problem\nand (ii) extrapolation of the projection outside the truncated projection to solve the\nROI CT problem. The decoupled sub-problems are solved directly with a novel\nproposed end-to-end learning using dual-domain CNNs. Main results: We\ndemonstrate that the proposed method outperforms the conventional image-domain\ndeep learning methods, and a projection-domain CNN shows better performance\nthan the image-domain CNNs which are commonly used by many researchers.\n","authors":["Yoseob Han","Dufan Wu","Kyungsang Kim","Quanzheng Li"],"pdf_url":"https://arxiv.org/pdf/2501.05085v1.pdf","comment":"Published by Physics in Medicine & Biology (2022.5)"},{"id":"http://arxiv.org/abs/2501.02227v2","updated":"2025-01-09T08:59:41Z","published":"2025-01-04T08:25:32Z","title":"tCURLoRA: Tensor CUR Decomposition Based Low-Rank Parameter Adaptation\n and Its Application in Medical Image Segmentation","summary":" Transfer learning, by leveraging knowledge from pre-trained models, has\nsignificantly enhanced the performance of target tasks. However, as deep neural\nnetworks scale up, full fine-tuning introduces substantial computational and\nstorage challenges in resource-constrained environments, limiting its\nwidespread adoption. 
To address this, parameter-efficient fine-tuning (PEFT)\nmethods have been developed to reduce computational complexity and storage\nrequirements by minimizing the number of updated parameters. While matrix\ndecomposition-based PEFT methods, such as LoRA, show promise, they struggle to\nfully capture the high-dimensional structural characteristics of model weights.\nIn contrast, high-dimensional tensors offer a more natural representation of\nneural network weights, allowing for a more comprehensive capture of\nhigher-order features and multi-dimensional interactions. In this paper, we\npropose tCURLoRA, a novel fine-tuning method based on tensor CUR decomposition.\nBy concatenating pre-trained weight matrices into a three-dimensional tensor\nand applying tensor CUR decomposition, we update only the lower-order tensor\ncomponents during fine-tuning, effectively reducing computational and storage\noverhead. Experimental results demonstrate that tCURLoRA outperforms existing\nPEFT methods in medical image segmentation tasks.\n","authors":["Guanghua He","Wangang Cheng","Hancan Zhu","Xiaohao Cai","Gaohang Yu"],"pdf_url":"https://arxiv.org/pdf/2501.02227v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05076v1","updated":"2025-01-09T08:59:23Z","published":"2025-01-09T08:59:23Z","title":"TipSegNet: Fingertip Segmentation in Contactless Fingerprint Imaging","summary":" Contactless fingerprint recognition systems offer a hygienic, user-friendly,\nand efficient alternative to traditional contact-based methods. However, their\naccuracy heavily relies on precise fingertip detection and segmentation,\nparticularly under challenging background conditions. 
This paper introduces\nTipSegNet, a novel deep learning model that achieves state-of-the-art\nperformance in segmenting fingertips directly from grayscale hand images.\nTipSegNet leverages a ResNeXt-101 backbone for robust feature extraction,\ncombined with a Feature Pyramid Network (FPN) for multi-scale representation,\nenabling accurate segmentation across varying finger poses and image qualities.\nFurthermore, we employ an extensive data augmentation strategy to enhance the\nmodel's generalizability and robustness. TipSegNet outperforms existing\nmethods, achieving a mean Intersection over Union (mIoU) of 0.987 and an\naccuracy of 0.999, representing a significant advancement in contactless\nfingerprint segmentation. This enhanced accuracy has the potential to\nsubstantially improve the reliability and effectiveness of contactless\nbiometric systems in real-world applications.\n","authors":["Laurenz Ruzicka","Bernhard Kohn","Clemens Heitzinger"],"pdf_url":"https://arxiv.org/pdf/2501.05076v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05072v1","updated":"2025-01-09T08:54:19Z","published":"2025-01-09T08:54:19Z","title":"A Flexible and Scalable Framework for Video Moment Search","summary":" Video moment search, the process of finding relevant moments in a video\ncorpus to match a user's query, is crucial for various applications. Existing\nsolutions, however, often assume a single perfect matching moment, struggle\nwith inefficient inference, and have limitations with hour-long videos. This\npaper introduces a flexible and scalable framework for retrieving a ranked list\nof moments from a collection of videos of any length to match a text query, a\ntask termed Ranked Video Moment Retrieval (RVMR). Our framework, called\nSegment-Proposal-Ranking (SPR), simplifies the search process into three\nindependent stages: segment retrieval, proposal generation, and moment\nrefinement with re-ranking. 
Specifically, videos are divided into equal-length\nsegments with precomputed embeddings indexed offline, allowing efficient\nretrieval regardless of video length. For scalable online retrieval, both\nsegments and queries are projected into a shared feature space to enable\napproximate nearest neighbor (ANN) search. Retrieved segments are then merged\ninto coarse-grained moment proposals. Then a refinement and re-ranking module\nis designed to reorder and adjust timestamps of the coarse-grained proposals.\nEvaluations on the TVR-Ranking dataset demonstrate that our framework achieves\nstate-of-the-art performance with significant reductions in computational cost\nand processing time. The flexible design also allows for independent\nimprovements to each stage, making SPR highly adaptable for large-scale\napplications.\n","authors":["Chongzhi Zhang","Xizhou Zhu","Aixin Sun"],"pdf_url":"https://arxiv.org/pdf/2501.05072v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.19599v2","updated":"2025-01-09T08:49:40Z","published":"2024-09-29T07:32:14Z","title":"DATransNet: Dynamic Attention Transformer Network for Infrared Small\n Target Detection","summary":" Infrared small target detection (ISTD) is widely used in civilian and\nmilitary applications. However, ISTD encounters several challenges, including\nthe tendency for small and dim targets to be obscured by complex backgrounds. To\naddress this issue, we propose the Dynamic Attention Transformer Network\n(DATransNet), which aims to extract and preserve edge information of small\ntargets. DATransNet employs the Dynamic Attention Transformer (DATrans),\nsimulating central difference convolutions (CDC) to extract and integrate\ngradient features with deeper features. Furthermore, we propose a global feature\nextraction module (GFEM) that offers a comprehensive perspective to prevent the\nnetwork from focusing solely on details while neglecting the background\ninformation. 
We compare the network with state-of-the-art (SOTA) approaches,\nand the results demonstrate that our method performs effectively. Our source\ncode is available at https://github.com/greekinRoma/DATransNet.\n","authors":["Chen Hu","Yian Huang","Kexuan Li","Luping Zhang","Chang Long","Yiming Zhu","Tian Pu","Zhenming Peng"],"pdf_url":"https://arxiv.org/pdf/2409.19599v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05069v1","updated":"2025-01-09T08:44:42Z","published":"2025-01-09T08:44:42Z","title":"Commonsense Video Question Answering through Video-Grounded Entailment\n Tree Reasoning","summary":" This paper proposes the first video-grounded entailment tree reasoning method\nfor commonsense video question answering (VQA). Despite the remarkable progress\nof large visual-language models (VLMs), there are growing concerns that they\nlearn spurious correlations between videos and likely answers, reinforced by\ntheir black-box nature and remaining benchmarking biases. Our method explicitly\ngrounds VQA tasks to video fragments in four steps: entailment tree\nconstruction, video-language entailment verification, tree reasoning, and\ndynamic tree expansion. A vital benefit of the method is its generalizability\nto current video and image-based VLMs across reasoning types. To support fair\nevaluation, we devise a de-biasing procedure based on large-language models\nthat rewrites VQA benchmark answer sets to enforce model reasoning. Systematic\nexperiments on existing and de-biased benchmarks highlight the impact of our\nmethod components across benchmarks, VLMs, and reasoning types.\n","authors":["Huabin Liu","Filip Ilievski","Cees G. M. 
Snoek"],"pdf_url":"https://arxiv.org/pdf/2501.05069v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05067v1","updated":"2025-01-09T08:43:57Z","published":"2025-01-09T08:43:57Z","title":"LLaVA-Octopus: Unlocking Instruction-Driven Adaptive Projector Fusion\n for Video Understanding","summary":" In this paper, we introduce LLaVA-Octopus, a novel video multimodal large\nlanguage model. LLaVA-Octopus adaptively weights features from different visual\nprojectors based on user instructions, enabling us to leverage the\ncomplementary strengths of each projector. We observe that different visual\nprojectors exhibit distinct characteristics when handling specific tasks. For\ninstance, some projectors excel at capturing static details, while others are\nmore effective at processing temporal information, and some are better suited\nfor tasks requiring temporal coherence. By dynamically adjusting feature\nweights according to user instructions, LLaVA-Octopus dynamically selects and\ncombines the most suitable features, significantly enhancing the model's\nperformance in multimodal tasks. Experimental results demonstrate that\nLLaVA-Octopus achieves excellent performance across multiple benchmarks,\nespecially in tasks such as multimodal understanding, visual question\nanswering, and video understanding, highlighting its broad application\npotential.\n","authors":["Jiaxing Zhao","Boyuan Sun","Xiang Chen","Xihan Wei","Qibin Hou"],"pdf_url":"https://arxiv.org/pdf/2501.05067v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05066v1","updated":"2025-01-09T08:43:09Z","published":"2025-01-09T08:43:09Z","title":"Improving Skeleton-based Action Recognition with Interactive Object\n Information","summary":" Human skeleton information is important in skeleton-based action recognition,\nwhich provides a simple and efficient way to describe human pose. 
However,\nexisting skeleton-based methods focus more on the skeleton, ignoring the\nobjects interacting with humans, resulting in poor performance in recognizing\nactions that involve object interactions. We propose a new action recognition\nframework introducing object nodes to supplement absent interactive object\ninformation. We also propose Spatial Temporal Variable Graph Convolutional\nNetworks (ST-VGCN) to effectively model the Variable Graph (VG) containing\nobject nodes. Specifically, to validate the role of interactive object\ninformation, we leverage a simple self-training approach to establish a new\ndataset, JXGC 24, and an extended dataset, NTU RGB+D+Object 60, including more\nthan 2 million additional object nodes. At the same time, we design the\nVariable Graph construction method to accommodate a variable number of nodes\nfor the graph structure. Additionally, we are the first to explore the overfitting\nissue introduced by incorporating additional object information, and we propose\na VG-based data augmentation method to address this issue, called Random Node\nAttack. Finally, regarding the network structure, we introduce two fusion\nmodules, CAF and WNPool, along with a novel Node Balance Loss, to enhance the\ncomprehensive performance by effectively fusing and balancing skeleton and\nobject node information. Our method surpasses the previous state-of-the-art on\nmultiple skeleton-based action recognition benchmarks. 
The accuracy of our\nmethod on NTU RGB+D 60 cross-subject split is 96.7\\%, and on cross-view split,\nit is 99.2\\%.\n","authors":["Hao Wen","Ziqian Lu","Fengli Shen","Zhe-Ming Lu","Jialin Cui"],"pdf_url":"https://arxiv.org/pdf/2501.05066v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05427v2","updated":"2025-01-09T08:31:23Z","published":"2024-09-09T08:26:47Z","title":"TextToucher: Fine-Grained Text-to-Touch Generation","summary":" Tactile sensation plays a crucial role in the development of multi-modal\nlarge models and embodied intelligence. To collect tactile data at minimal\ncost, a series of studies have attempted to generate tactile images\nby vision-to-touch image translation. However, compared to the text modality,\nvisual modality-driven tactile generation cannot accurately depict human\ntactile sensation. In this work, we analyze the characteristics of tactile\nimages in detail from two granularities: object-level (tactile texture, tactile\nshape), and sensor-level (gel status). We model these granularities of\ninformation through text descriptions and propose a fine-grained Text-to-Touch\ngeneration method (TextToucher) to generate high-quality tactile samples.\nSpecifically, we introduce a multimodal large language model to build the text\nsentences about object-level tactile information and employ a set of learnable\ntext prompts to represent the sensor-level tactile information. To better guide\nthe tactile generation process with the built text information, we fuse the\ndual grains of text information and explore various dual-grain text\nconditioning methods within the diffusion transformer architecture.\nFurthermore, we propose a Contrastive Text-Touch Pre-training (CTTP) metric to\nprecisely evaluate the quality of text-driven generated tactile data. Extensive\nexperiments demonstrate the superiority of our TextToucher method. 
The source\ncode will be available at \\url{https://github.com/TtuHamg/TextToucher}.\n","authors":["Jiahang Tu","Hao Fu","Fengyu Yang","Hanbin Zhao","Chao Zhang","Hui Qian"],"pdf_url":"https://arxiv.org/pdf/2409.05427v2.pdf","comment":"This paper has been accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2501.03397v2","updated":"2025-01-09T08:28:11Z","published":"2025-01-06T21:34:52Z","title":"DoubleDiffusion: Combining Heat Diffusion with Denoising Diffusion for\n Generative Learning on 3D Meshes","summary":" This paper proposes DoubleDiffusion, a novel framework that combines heat\ndissipation diffusion and denoising diffusion for direct generative learning on\n3D mesh surfaces. Our approach addresses the challenges of generating\ncontinuous signal distributions residing on a curved manifold surface. Unlike\nprevious methods that rely on unrolling 3D meshes into 2D or adopting field\nrepresentations, DoubleDiffusion leverages the Laplace-Beltrami operator to\nprocess features respecting the mesh structure. This combination enables\neffective geometry-aware signal diffusion across the underlying geometry. As\nshown in Fig.1, we demonstrate that DoubleDiffusion has the ability to generate\nRGB signal distributions on complex 3D mesh surfaces and achieves per-category\nshape-conditioned texture generation across different shape geometries. 
Our work\ncontributes a new direction in diffusion-based generative modeling on 3D\nsurfaces, with potential applications in the field of 3D asset generation.\n","authors":["Xuyang Wang","Ziang Cheng","Zhenyu Li","Jiayu Yang","Haorui Ji","Pan Ji","Mehrtash Harandi","Richard Hartley","Hongdong Li"],"pdf_url":"https://arxiv.org/pdf/2501.03397v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.18571v3","updated":"2025-01-09T07:58:38Z","published":"2024-06-03T11:48:17Z","title":"UltraCortex: Submillimeter Ultra-High Field 9.4 T Brain MR Image\n Collection and Manual Cortical Segmentations","summary":" The UltraCortex repository (https://www.ultracortex.org) houses magnetic\nresonance imaging data of the human brain obtained at an ultra-high field\nstrength of 9.4 T. It contains 86 structural MR images with spatial resolutions\nranging from 0.6 to 0.8 mm. Additionally, the repository includes segmentations\nof 12 brains into gray and white matter compartments. These segmentations have\nbeen independently validated by two expert neuroradiologists, thus establishing\nthem as a reliable gold standard. This resource provides researchers with\naccess to high-quality brain imaging data and validated segmentations,\nfacilitating neuroimaging studies and advancing our understanding of brain\nstructure and function. 
Existing repositories do not accommodate field\nstrengths beyond 7 T, nor do they offer validated segmentations, underscoring\nthe significance of this new resource.\n","authors":["Lucas Mahler","Julius Steiglechner","Benjamin Bender","Tobias Lindig","Dana Ramadan","Jonas Bause","Florian Birk","Rahel Heule","Edyta Charyasz","Michael Erb","Vinod Jangir Kumar","Gisela E Hagberg","Pascal Martin","Gabriele Lohmann","Klaus Scheffler"],"pdf_url":"https://arxiv.org/pdf/2406.18571v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10440v3","updated":"2025-01-09T07:58:20Z","published":"2024-11-15T18:58:31Z","title":"LLaVA-CoT: Let Vision Language Models Reason Step-by-Step","summary":" Large language models have demonstrated substantial advancements in reasoning\ncapabilities, particularly through inference-time scaling, as illustrated by\nmodels such as OpenAI's o1. However, current Vision-Language Models (VLMs)\noften struggle to perform systematic and structured reasoning, especially when\nhandling complex visual question-answering tasks. In this work, we introduce\nLLaVA-CoT, a novel VLM designed to conduct autonomous multistage reasoning.\nUnlike chain-of-thought prompting, LLaVA-CoT independently engages in\nsequential stages of summarization, visual interpretation, logical reasoning,\nand conclusion generation. This structured approach enables LLaVA-CoT to\nachieve marked improvements in precision on reasoning-intensive tasks. To\naccomplish this, we compile the LLaVA-CoT-100k dataset, integrating samples\nfrom various visual question answering sources and providing structured\nreasoning annotations. Besides, we propose an inference-time stage-level beam\nsearch method, which enables effective inference-time scaling. 
Remarkably, with\nonly 100k training samples and a simple yet effective inference-time scaling\nmethod, LLaVA-CoT not only outperforms its base model by 7.4% on a wide range\nof multimodal reasoning benchmarks, but also surpasses the performance of\nlarger and even closed-source models, such as Gemini-1.5-pro, GPT-4o-mini, and\nLlama-3.2-90B-Vision-Instruct.\n","authors":["Guowei Xu","Peng Jin","Hao Li","Yibing Song","Lichao Sun","Li Yuan"],"pdf_url":"https://arxiv.org/pdf/2411.10440v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05037v1","updated":"2025-01-09T07:51:14Z","published":"2025-01-09T07:51:14Z","title":"LongViTU: Instruction Tuning for Long-Form Video Understanding","summary":" This paper introduces LongViTU, a large-scale (~121k QA pairs, ~900h videos),\nautomatically generated dataset for long-form video understanding. We developed\na systematic approach that organizes videos into a hierarchical tree structure\nand incorporates self-revision mechanisms to ensure high-quality QA pairs. Each\nQA pair in LongViTU features: 1) long-term context (average certificate length\nof 4.6 minutes); 2) rich knowledge and condensed reasoning (commonsense,\ncausality, planning, etc.); and 3) explicit timestamp labels for relevant\nevents. LongViTU also serves as a benchmark for instruction following in\nlong-form and streaming video understanding. We evaluate the open-source\nstate-of-the-art long video understanding model, LongVU, and the commercial\nmodel, Gemini-1.5-Pro, on our benchmark. They achieve GPT-4 scores of 49.9 and\n52.3, respectively, underscoring the substantial challenge posed by our\nbenchmark. Further supervised fine-tuning (SFT) on LongVU led to performance\nimprovements of 12.0% on our benchmark, 2.2% on the in-distribution (ID)\nbenchmark EgoSchema, 1.0%, 2.2% and 1.2% on the out-of-distribution (OOD)\nbenchmarks VideoMME (Long), WorldQA and OpenEQA, respectively. 
These outcomes\ndemonstrate LongViTU's high data quality and robust OOD generalizability.\n","authors":["Rujie Wu","Xiaojian Ma","Hai Ci","Yue Fan","Yuxuan Wang","Haozhe Zhao","Qing Li","Yizhou Wang"],"pdf_url":"https://arxiv.org/pdf/2501.05037v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05034v1","updated":"2025-01-09T07:49:37Z","published":"2025-01-09T07:49:37Z","title":"Towards Fingerprint Mosaicking Artifact Detection: A Self-Supervised\n Deep Learning Approach","summary":" Fingerprint mosaicking, which is the process of combining multiple\nfingerprint images into a single master fingerprint, is an essential process in\nmodern biometric systems. However, it is prone to errors that can significantly\ndegrade fingerprint image quality. This paper proposes a novel deep\nlearning-based approach to detect and score mosaicking artifacts in fingerprint\nimages. Our method leverages a self-supervised learning framework to train a\nmodel on large-scale unlabeled fingerprint data, eliminating the need for\nmanual artifact annotation. The proposed model effectively identifies\nmosaicking errors, achieving high accuracy on various fingerprint modalities,\nincluding contactless, rolled, and pressed fingerprints and furthermore proves\nto be robust to different data sources. Additionally, we introduce a novel\nmosaicking artifact score to quantify the severity of errors, enabling\nautomated evaluation of fingerprint images. 
By addressing the challenges of\nmosaicking artifact detection, our work contributes to improving the accuracy\nand reliability of fingerprint-based biometric systems.\n","authors":["Laurenz Ruzicka","Alexander Spenke","Stephan Bergmann","Gerd Nolden","Bernhard Kohn","Clemens Heitzinger"],"pdf_url":"https://arxiv.org/pdf/2501.05034v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05031v1","updated":"2025-01-09T07:43:49Z","published":"2025-01-09T07:43:49Z","title":"ECBench: Can Multi-modal Foundation Models Understand the Egocentric\n World? A Holistic Embodied Cognition Benchmark","summary":" The enhancement of generalization in robots by large vision-language models\n(LVLMs) is increasingly evident. Therefore, the embodied cognitive abilities of\nLVLMs based on egocentric videos are of great interest. However, current\ndatasets for embodied video question answering lack comprehensive and\nsystematic evaluation frameworks. Critical embodied cognitive issues, such as\nrobotic self-cognition, dynamic scene perception, and hallucination, are rarely\naddressed. To tackle these challenges, we propose ECBench, a high-quality\nbenchmark designed to systematically evaluate the embodied cognitive abilities\nof LVLMs. ECBench features a diverse range of scene video sources, open and\nvaried question formats, and 30 dimensions of embodied cognition. To ensure\nquality, balance, and high visual dependence, ECBench uses class-independent\nmeticulous human annotation and multi-round question screening strategies.\nAdditionally, we introduce ECEval, a comprehensive evaluation system that\nensures the fairness and rationality of the indicators. Utilizing ECBench, we\nconduct extensive evaluations of proprietary, open-source, and task-specific\nLVLMs. ECBench is pivotal in advancing the embodied cognitive capabilities of\nLVLMs, laying a solid foundation for developing reliable core models for\nembodied agents. 
All data and code are available at\nhttps://github.com/Rh-Dang/ECBench.\n","authors":["Ronghao Dang","Yuqian Yuan","Wenqi Zhang","Yifei Xin","Boqiang Zhang","Long Li","Liuyi Wang","Qinyang Zeng","Xin Li","Lidong Bing"],"pdf_url":"https://arxiv.org/pdf/2501.05031v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.01973v3","updated":"2025-01-09T07:26:05Z","published":"2024-12-28T02:28:19Z","title":"INFELM: In-depth Fairness Evaluation of Large Text-To-Image Models","summary":" The rapid development of large language models (LLMs) and large vision models\n(LVMs) has propelled the evolution of multi-modal AI systems, which have\ndemonstrated the remarkable potential for industrial applications by emulating\nhuman-like cognition. However, they also pose significant ethical challenges,\nincluding amplifying harmful content and reinforcing societal biases. For\ninstance, biases in some industrial image generation models highlighted the\nurgent need for robust fairness assessments. Most existing evaluation\nframeworks focus on the comprehensiveness of various aspects of the models, but\nthey exhibit critical limitations, including insufficient attention to content\ngeneration alignment and social bias-sensitive domains. More importantly, their\nreliance on pixel-detection techniques is prone to inaccuracies.\n To address these issues, this paper presents INFELM, an in-depth fairness\nevaluation on widely-used text-to-image models. Our key contributions are: (1)\nan advanced skintone classifier incorporating facial topology and refined skin\npixel representation to enhance classification precision by at least 16.04%,\n(2) a bias-sensitive content alignment measurement for understanding societal\nimpacts, (3) a generalizable representation bias evaluation for diverse\ndemographic groups, and (4) extensive experiments analyzing large-scale\ntext-to-image model outputs across six social-bias-sensitive domains. 
We find\nthat existing models in the study generally do not meet the empirical fairness\ncriteria, and representation bias is generally more pronounced than alignment\nerrors. INFELM establishes a robust benchmark for fairness assessment,\nsupporting the development of multi-modal AI systems that align with ethical\nand human-centric principles.\n","authors":["Di Jin","Xing Liu","Yu Liu","Jia Qing Yap","Andrea Wong","Adriana Crespo","Qi Lin","Zhiyuan Yin","Qiang Yan","Ryan Ye"],"pdf_url":"https://arxiv.org/pdf/2501.01973v3.pdf","comment":"Di Jin and Xing Liu contributed equally to this work"},{"id":"http://arxiv.org/abs/2409.06710v2","updated":"2025-01-09T07:24:09Z","published":"2024-08-25T07:55:06Z","title":"McGrids: Monte Carlo-Driven Adaptive Grids for Iso-Surface Extraction","summary":" Iso-surface extraction from an implicit field is a fundamental process in\nvarious applications of computer vision and graphics. When dealing with\ngeometric shapes with complicated geometric details, many existing algorithms\nsuffer from high computational costs and memory usage. This paper proposes\nMcGrids, a novel approach to improve the efficiency of iso-surface extraction.\nThe key idea is to construct adaptive grids for iso-surface extraction rather\nthan using a simple uniform grid as prior art does. Specifically, we formulate\nthe problem of constructing adaptive grids as a probability sampling problem,\nwhich is then solved by Monte Carlo process. We demonstrate McGrids' capability\nwith extensive experiments from both analytical SDFs computed from surface\nmeshes and learned implicit fields from real multiview images. 
The experiment\nresults show that our McGrids can significantly reduce the number of implicit\nfield queries, resulting in significant memory reduction, while producing\nhigh-quality meshes with rich geometric details.\n","authors":["Daxuan Ren","Hezi Shi","Jianmin Zheng","Jianfei Cai"],"pdf_url":"https://arxiv.org/pdf/2409.06710v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05020v1","updated":"2025-01-09T07:23:48Z","published":"2025-01-09T07:23:48Z","title":"Perception-as-Control: Fine-grained Controllable Image Animation with\n 3D-aware Motion Representation","summary":" Motion-controllable image animation is a fundamental task with a wide range\nof potential applications. Recent works have made progress in controlling\ncamera or object motion via various motion representations, while they still\nstruggle to support collaborative camera and object motion control with\nadaptive control granularity. To this end, we introduce 3D-aware motion\nrepresentation and propose an image animation framework, called\nPerception-as-Control, to achieve fine-grained collaborative motion control.\nSpecifically, we construct 3D-aware motion representation from a reference\nimage, manipulate it based on interpreted user intentions, and perceive it from\ndifferent viewpoints. In this way, camera and object motions are transformed\ninto intuitive, consistent visual changes. Then, the proposed framework\nleverages the perception results as motion control signals, enabling it to\nsupport various motion-related video synthesis tasks in a unified and flexible\nway. Experiments demonstrate the superiority of the proposed framework. 
For\nmore details and qualitative results, please refer to our project webpage:\nhttps://chen-yingjie.github.io/projects/Perception-as-Control.\n","authors":["Yingjie Chen","Yifang Men","Yuan Yao","Miaomiao Cui","Liefeng Bo"],"pdf_url":"https://arxiv.org/pdf/2501.05020v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05017v1","updated":"2025-01-09T07:18:48Z","published":"2025-01-09T07:18:48Z","title":"Continuous Knowledge-Preserving Decomposition for Few-Shot Continual\n Learning","summary":" Few-shot class-incremental learning (FSCIL) involves learning new classes\nfrom limited data while retaining prior knowledge, and often results in\ncatastrophic forgetting. Existing methods either freeze backbone networks to\npreserve knowledge, which limits adaptability, or rely on additional modules or\nprompts, introducing inference overhead. To this end, we propose Continuous\nKnowledge-Preserving Decomposition for FSCIL (CKPD-FSCIL), a framework that\ndecomposes a model's weights into two parts: one that compacts existing\nknowledge (knowledge-sensitive components) and another that carries redundant\ncapacity to accommodate new abilities (redundant-capacity components). The\ndecomposition is guided by a covariance matrix from replay samples, ensuring\nprincipal components align with classification abilities. During adaptation, we\nfreeze the knowledge-sensitive components and only adapt the redundant-capacity\ncomponents, fostering plasticity while minimizing interference without changing\nthe architecture or increasing overhead. Additionally, CKPD introduces an\nadaptive layer selection strategy to identify layers with redundant capacity,\ndynamically allocating adapters. Experiments on multiple benchmarks show that\nCKPD-FSCIL outperforms state-of-the-art methods.\n","authors":["Xiaojie Li","Yibo Yang","Jianlong Wu","David A. 
Clifton","Yue Yu","Bernard Ghanem","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.05017v1.pdf","comment":"Code: https://github.com/xiaojieli0903/CKPD-FSCIL"},{"id":"http://arxiv.org/abs/2501.05009v1","updated":"2025-01-09T07:11:51Z","published":"2025-01-09T07:11:51Z","title":"A Scalable System for Visual Analysis of Ocean Data","summary":" Oceanographers rely on visual analysis to interpret model simulations,\nidentify events and phenomena, and track dynamic ocean processes. The ever\nincreasing resolution and complexity of ocean data due to its dynamic nature\nand multivariate relationships demands a scalable and adaptable visualization\ntool for interactive exploration. We introduce pyParaOcean, a scalable and\ninteractive visualization system designed specifically for ocean data analysis.\npyParaOcean offers specialized modules for common oceanographic analysis tasks,\nincluding eddy identification and salinity movement tracking. These modules\nseamlessly integrate with ParaView as filters, ensuring a user-friendly and\neasy-to-use system while leveraging the parallelization capabilities of\nParaView and a plethora of inbuilt general-purpose visualization\nfunctionalities. The creation of an auxiliary dataset stored as a Cinema\ndatabase helps address I/O and network bandwidth bottlenecks while supporting\nthe generation of quick overview visualizations. We present a case study on the\nBay of Bengal (BoB) to demonstrate the utility of the system and scaling\nstudies to evaluate the efficiency of the system.\n","authors":["Toshit Jain","Upkar Singh","Varun Singh","Vijay Kumar Boda","Ingrid Hotz","Sathish S. Vadhiyar","P. N. 
Vinayachandran","Vijay Natarajan"],"pdf_url":"https://arxiv.org/pdf/2501.05009v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04996v1","updated":"2025-01-09T06:22:50Z","published":"2025-01-09T06:22:50Z","title":"A CT Image Classification Network Framework for Lung Tumors Based on\n Pre-trained MobileNetV2 Model and Transfer learning, And Its Application and\n Market Analysis in the Medical field","summary":" In the medical field, accurate diagnosis of lung cancer is crucial for\ntreatment. Traditional manual analysis methods have significant limitations in\nterms of accuracy and efficiency. To address this issue, this paper proposes a\ndeep learning network framework based on the pre-trained MobileNetV2 model,\ninitialized with weights from the ImageNet-1K dataset (version 2). The last\nlayer of the model (the fully connected layer) is replaced with a new fully\nconnected layer, and a softmax activation function is added to efficiently\nclassify three types of lung cancer CT scan images. Experimental results show\nthat the model achieves an accuracy of 99.6% on the test set, with significant\nimprovements in feature extraction compared to traditional models.With the\nrapid development of artificial intelligence technologies, deep learning\napplications in medical image processing are bringing revolutionary changes to\nthe healthcare industry. AI-based lung cancer detection systems can\nsignificantly improve diagnostic efficiency, reduce the workload of doctors,\nand occupy an important position in the global healthcare market. 
The potential\nof AI to improve diagnostic accuracy, reduce medical costs, and promote\nprecision medicine will have a profound impact on the future development of the\nhealthcare industry.\n","authors":["Ziyang Gao","Yong Tian","Shih-Chi Lin","Junghua Lin"],"pdf_url":"https://arxiv.org/pdf/2501.04996v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04995v1","updated":"2025-01-09T06:20:00Z","published":"2025-01-09T06:20:00Z","title":"IPDN: Image-enhanced Prompt Decoding Network for 3D Referring Expression\n Segmentation","summary":" 3D Referring Expression Segmentation (3D-RES) aims to segment point cloud\nscenes based on a given expression. However, existing 3D-RES approaches face\ntwo major challenges: feature ambiguity and intent ambiguity. Feature ambiguity\narises from information loss or distortion during point cloud acquisition due\nto limitations such as lighting and viewpoint. Intent ambiguity refers to the\nmodel's equal treatment of all queries during the decoding process, lacking\ntop-down task-specific guidance. In this paper, we introduce an Image enhanced\nPrompt Decoding Network (IPDN), which leverages multi-view images and\ntask-driven information to enhance the model's reasoning capabilities. To\naddress feature ambiguity, we propose the Multi-view Semantic Embedding (MSE)\nmodule, which injects multi-view 2D image information into the 3D scene and\ncompensates for potential spatial information loss. To tackle intent ambiguity,\nwe designed a Prompt-Aware Decoder (PAD) that guides the decoding process by\nderiving task-driven signals from the interaction between the expression and\nvisual features. 
Comprehensive experiments demonstrate that IPDN outperforms\nthe state-of-the-art by 1.9 and 4.2 points in mIoU metrics on the 3D-RES and\n3D-GRES tasks, respectively.\n","authors":["Qi Chen","Changli Wu","Jiayi Ji","Yiwei Ma","Danni Yang","Xiaoshuai Sun"],"pdf_url":"https://arxiv.org/pdf/2501.04995v1.pdf","comment":"AAAI 2025"},{"id":"http://arxiv.org/abs/2501.02260v2","updated":"2025-01-09T06:14:09Z","published":"2025-01-04T11:28:49Z","title":"MagicFace: High-Fidelity Facial Expression Editing with Action-Unit\n Control","summary":" We address the problem of facial expression editing by controlling the\nrelative variation of facial action-unit (AU) from the same person. This\nenables us to edit this specific person's expression in a fine-grained,\ncontinuous and interpretable manner, while preserving their identity, pose,\nbackground and detailed facial attributes. Key to our model, which we dub\nMagicFace, is a diffusion model conditioned on AU variations and an ID encoder\nto preserve facial details of high consistency. Specifically, to preserve the\nfacial details with the input identity, we leverage the power of pretrained\nStable-Diffusion models and design an ID encoder to merge appearance features\nthrough self-attention. To keep background and pose consistency, we introduce\nan efficient Attribute Controller by explicitly informing the model of current\nbackground and pose of the target. By injecting AU variations into a denoising\nUNet, our model can animate arbitrary identities with various AU combinations,\nyielding superior results in high-fidelity expression editing compared to other\nfacial expression editing works. 
Code is publicly available at\nhttps://github.com/weimengting/MagicFace.\n","authors":["Mengting Wei","Tuomas Varanka","Xingxun Jiang","Huai-Qian Khor","Guoying Zhao"],"pdf_url":"https://arxiv.org/pdf/2501.02260v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04975v1","updated":"2025-01-09T05:12:38Z","published":"2025-01-09T05:12:38Z","title":"V2C-CBM: Building Concept Bottlenecks with Vision-to-Concept Tokenizer","summary":" Concept Bottleneck Models (CBMs) offer inherent interpretability by initially\ntranslating images into human-comprehensible concepts, followed by a linear\ncombination of these concepts for classification. However, the annotation of\nconcepts for visual recognition tasks requires extensive expert knowledge and\nlabor, constraining the broad adoption of CBMs. Recent approaches have\nleveraged the knowledge of large language models to construct concept\nbottlenecks, with multimodal models like CLIP subsequently mapping image\nfeatures into the concept feature space for classification. Despite this, the\nconcepts produced by language models can be verbose and may introduce\nnon-visual attributes, which hurts accuracy and interpretability. In this\nstudy, we investigate how to avoid these issues by constructing CBMs directly from\nmultimodal models. To this end, we adopt common words as a base concept\nvocabulary and leverage auxiliary unlabeled images to construct a\nVision-to-Concept (V2C) tokenizer that can explicitly quantize images into\ntheir most relevant visual concepts, thus creating a vision-oriented concept\nbottleneck tightly coupled with the multimodal model. This leads to our V2C-CBM,\nwhich is training-efficient and interpretable with high accuracy. 
Our V2C-CBM\nhas matched or outperformed LLM-supervised CBMs on various visual\nclassification benchmarks, validating the efficacy of our approach.\n","authors":["Hangzhou He","Lei Zhu","Xinliang Zhang","Shuang Zeng","Qian Chen","Yanye Lu"],"pdf_url":"https://arxiv.org/pdf/2501.04975v1.pdf","comment":"Accepted by AAAI2025"},{"id":"http://arxiv.org/abs/2410.10777v2","updated":"2025-01-09T04:57:35Z","published":"2024-10-14T17:49:27Z","title":"UniMatch V2: Pushing the Limit of Semi-Supervised Semantic Segmentation","summary":" Semi-supervised semantic segmentation (SSS) aims at learning rich visual\nknowledge from cheap unlabeled images to enhance semantic segmentation\ncapability. Among recent works, UniMatch improves its precedents tremendously\nby amplifying the practice of weak-to-strong consistency regularization.\nSubsequent works typically follow similar pipelines and propose various\ndelicate designs. Despite the achieved progress, strangely, even in this\nflourishing era of numerous powerful vision models, almost all SSS works are\nstill sticking to 1) using outdated ResNet encoders with small-scale\nImageNet-1K pre-training, and 2) evaluation on simple Pascal and Cityscapes\ndatasets. In this work, we argue that, it is necessary to switch the baseline\nof SSS from ResNet-based encoders to more capable ViT-based encoders (e.g.,\nDINOv2) that are pre-trained on massive data. A simple update on the encoder\n(even using 2x fewer parameters) can bring more significant improvement than\ncareful method designs. Built on this competitive baseline, we present our\nupgraded and simplified UniMatch V2, inheriting the core spirit of\nweak-to-strong consistency from V1, but requiring less training cost and\nproviding consistently better results. Additionally, witnessing the gradually\nsaturated performance on Pascal and Cityscapes, we appeal that we should focus\non more challenging benchmarks with complex taxonomy, such as ADE20K and COCO\ndatasets. 
Code, models, and logs of all reported values are available at\nhttps://github.com/LiheYoung/UniMatch-V2.\n","authors":["Lihe Yang","Zhen Zhao","Hengshuang Zhao"],"pdf_url":"https://arxiv.org/pdf/2410.10777v2.pdf","comment":"Accepted by TPAMI"},{"id":"http://arxiv.org/abs/2501.02795v2","updated":"2025-01-09T04:50:16Z","published":"2025-01-06T06:29:55Z","title":"InfiFusion: A Unified Framework for Enhanced Cross-Model Reasoning via\n LLM Fusion","summary":" Large Language Models (LLMs) have demonstrated strong performance across\nvarious reasoning tasks, yet building a single model that consistently excels\nacross all domains remains challenging. This paper addresses this problem by\nexploring strategies to integrate multiple domain-specialized models into an\nefficient pivot model. We propose two fusion strategies to combine the strengths\nof multiple LLMs: (1) a pairwise, multi-step fusion approach that sequentially\ndistills each source model into the pivot model, followed by a weight merging\nstep to integrate the distilled models into the final model. 
This method\nachieves strong performance but requires substantial training effort; and (2) a\nunified fusion approach that aggregates all source models' outputs\nsimultaneously. To improve the fusion process, we introduce a novel\nRate-Skewness Adaptive Fusion (RSAF) technique, which dynamically adjusts top-K\nratios during parameter merging for enhanced flexibility and\nstability. Furthermore, we propose an uncertainty-based weighting method for the\nunified approach, which dynamically balances the contributions of source models\nand outperforms other logits/distribution ensemble methods. We achieved accuracy\nimprovements of 9.27%, 8.80%, and 8.89% on the GSM8K, MATH, and HumanEval\ntasks, respectively.\n","authors":["Zhaoyi Yan","Zhijie Sang","Yiming Zhang","Yuhao Fu","Baoyi He","Qi Zhou","Yining Di","Chunlin Ji","Shengyu Zhang","Fei Wu","Hongxia Yang"],"pdf_url":"https://arxiv.org/pdf/2501.02795v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2501.04969v1","updated":"2025-01-09T04:47:51Z","published":"2025-01-09T04:47:51Z","title":"AD-L-JEPA: Self-Supervised Spatial World Models with Joint Embedding\n Predictive Architecture for Autonomous Driving with LiDAR Data","summary":" As opposed to human drivers, current autonomous driving systems still require\nvast amounts of labeled data to train. Recently, world models have been\nproposed to simultaneously enhance autonomous driving capabilities by improving\nthe way these systems understand complex real-world environments and reduce\ntheir data demands via self-supervised pre-training. In this paper, we present\nAD-L-JEPA (aka Autonomous Driving with LiDAR data via a Joint Embedding\nPredictive Architecture), a novel self-supervised pre-training framework for\nautonomous driving with LiDAR data that, as opposed to existing methods, is\nneither generative nor contrastive. Our method learns spatial world models with\na joint embedding predictive architecture. 
Instead of explicitly generating\nmasked unknown regions, our self-supervised world models predict Bird's Eye\nView (BEV) embeddings to represent the diverse nature of autonomous driving\nscenes. Our approach furthermore eliminates the need to manually create\npositive and negative pairs, as is the case in contrastive learning. AD-L-JEPA\nleads to simpler implementation and enhanced learned representations. We\nqualitatively and quantitatively demonstrate the high quality of the embeddings\nlearned with AD-L-JEPA. We furthermore evaluate the accuracy and label efficiency of\nAD-L-JEPA on popular downstream tasks such as LiDAR 3D object detection and\nassociated transfer learning. Our experimental evaluation demonstrates that\nAD-L-JEPA is a plausible approach for self-supervised pre-training in\nautonomous driving applications and outperforms state-of-the-art methods, including\nthe recently proposed Occupancy-MAE [1] and ALSO\n[2]. The source code of AD-L-JEPA is available at\nhttps://github.com/HaoranZhuExplorer/AD-L-JEPA-Release.\n","authors":["Haoran Zhu","Zhenyuan Dong","Kristi Topollai","Anna Choromanska"],"pdf_url":"https://arxiv.org/pdf/2501.04969v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04966v1","updated":"2025-01-09T04:37:31Z","published":"2025-01-09T04:37:31Z","title":"Emergence of Painting Ability via Recognition-Driven Evolution","summary":" From Paleolithic cave paintings to Impressionism, human painting has evolved\nto depict increasingly complex and detailed scenes, conveying more nuanced\nmessages. This paper attempts to induce the emergence of this artistic capability by simulating\nthe evolutionary pressures that enhance visual communication efficiency.\nSpecifically, we present a model with a stroke branch and a palette branch that\ntogether simulate human-like painting. 
The palette branch learns a limited\ncolour palette, while the stroke branch parameterises each stroke using\nB\\'ezier curves to render an image, subsequently evaluated by a high-level\nrecognition module. We quantify the efficiency of visual communication by\nmeasuring the recognition accuracy achieved with machine vision. The model then\noptimises the control points and colour choices for each stroke to maximise\nrecognition accuracy with minimal strokes and colours. Experimental results\nshow that our model achieves superior performance in high-level recognition\ntasks, delivering artistic expression and aesthetic appeal, especially in\nabstract sketches. Additionally, our approach shows promise as an efficient\nbit-level image compression technique, outperforming traditional methods.\n","authors":["Yi Lin","Lin Gu","Ziteng Cui","Shenghan Su","Yumo Hao","Yingtao Tian","Tatsuya Harada","Jianfei Yang"],"pdf_url":"https://arxiv.org/pdf/2501.04966v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03847v2","updated":"2025-01-09T04:25:42Z","published":"2025-01-07T15:01:58Z","title":"Diffusion as Shader: 3D-aware Video Diffusion for Versatile Video\n Generation Control","summary":" Diffusion models have demonstrated impressive performance in generating\nhigh-quality videos from text prompts or images. However, precise control over\nthe video generation process, such as camera manipulation or content editing,\nremains a significant challenge. Existing methods for controlled video\ngeneration are typically limited to a single control type, lacking the\nflexibility to handle diverse control demands. In this paper, we introduce\nDiffusion as Shader (DaS), a novel approach that supports multiple video\ncontrol tasks within a unified architecture. Our key insight is that achieving\nversatile video control necessitates leveraging 3D control signals, as videos\nare fundamentally 2D renderings of dynamic 3D content. 
Unlike prior methods\nlimited to 2D control signals, DaS leverages 3D tracking videos as control\ninputs, making the video diffusion process inherently 3D-aware. This innovation\nallows DaS to achieve a wide range of video controls by simply manipulating the\n3D tracking videos. A further advantage of using 3D tracking videos is their\nability to effectively link frames, significantly enhancing the temporal\nconsistency of the generated videos. With just 3 days of fine-tuning on 8 H800\nGPUs using less than 10k videos, DaS demonstrates strong control capabilities\nacross diverse tasks, including mesh-to-video generation, camera control,\nmotion transfer, and object manipulation.\n","authors":["Zekai Gu","Rui Yan","Jiahao Lu","Peng Li","Zhiyang Dou","Chenyang Si","Zhen Dong","Qifeng Liu","Cheng Lin","Ziwei Liu","Wenping Wang","Yuan Liu"],"pdf_url":"https://arxiv.org/pdf/2501.03847v2.pdf","comment":"Project page: https://igl-hkust.github.io/das/ Codes:\n https://github.com/IGL-HKUST/DiffusionAsShader"},{"id":"http://arxiv.org/abs/2311.09346v2","updated":"2025-01-09T04:20:34Z","published":"2023-11-15T20:09:29Z","title":"Nothing Stands Still: A Spatiotemporal Benchmark on 3D Point Cloud\n Registration Under Large Geometric and Temporal Change","summary":" Building 3D geometric maps of man-made spaces is a well-established and\nactive field that is fundamental to computer vision and robotics. However,\nconsidering the evolving nature of built environments, it is essential to\nquestion the capabilities of current mapping efforts in handling temporal\nchanges. In addition, spatiotemporal mapping holds significant potential for\nachieving sustainability and circularity goals. 
Existing mapping approaches\nfocus on small changes, such as object relocation or self-driving car\noperation, where in all cases the main structure of the scene remains fixed.\nConsequently, these approaches fail to address more radical changes in the\nstructure of the built environment, such as geometry and topology. To this end,\nwe introduce the Nothing Stands Still (NSS) benchmark, which focuses on the\nspatiotemporal registration of 3D scenes undergoing large spatial and temporal\nchange, ultimately creating one coherent spatiotemporal map. Specifically, the\nbenchmark involves registering two or more partial 3D point clouds (fragments)\nfrom the same scene but captured from different spatiotemporal views. In\naddition to the standard pairwise registration, we assess the multi-way\nregistration of multiple fragments that belong to any temporal stage. As part\nof NSS, we introduce a dataset of 3D point clouds recurrently captured in\nlarge-scale building indoor environments that are under construction or\nrenovation. The NSS benchmark presents three scenarios of increasing\ndifficulty, to quantify the generalization ability of point cloud registration\nmethods over space (within one building and across buildings) and time. We\nconduct extensive evaluations of state-of-the-art methods on NSS. The results\ndemonstrate the necessity for novel methods specifically designed to handle\nlarge spatiotemporal changes. The homepage of our benchmark is at\nhttp://nothing-stands-still.com.\n","authors":["Tao Sun","Yan Hao","Shengyu Huang","Silvio Savarese","Konrad Schindler","Marc Pollefeys","Iro Armeni"],"pdf_url":"https://arxiv.org/pdf/2311.09346v2.pdf","comment":"To appear in the ISPRS Journal of Photogrammetry and Remote Sensing.\n 29 pages, 26 figures. 
For the project page, see\n http://nothing-stands-still.com"},{"id":"http://arxiv.org/abs/2501.04958v1","updated":"2025-01-09T04:20:12Z","published":"2025-01-09T04:20:12Z","title":"Addressing Domain Shift via Imbalance-Aware Domain Adaptation in Embryo\n Development Assessment","summary":" Deep learning models in medical imaging face dual challenges: domain shift,\nwhere models perform poorly when deployed in settings different from their\ntraining environment, and class imbalance, where certain disease conditions are\nnaturally underrepresented. We present Imbalance-Aware Domain Adaptation\n(IADA), a novel framework that simultaneously tackles both challenges through\nthree key components: (1) adaptive feature learning with class-specific\nattention mechanisms, (2) balanced domain alignment with dynamic weighting, and\n(3) adaptive threshold optimization. Our theoretical analysis establishes\nconvergence guarantees and complexity bounds. Through extensive experiments on\nembryo development assessment across four imaging modalities, IADA demonstrates\nsignificant improvements over existing methods, achieving up to 25.19\\% higher\naccuracy while maintaining balanced performance across classes. In challenging\nscenarios with low-quality imaging systems, IADA shows robust generalization\nwith AUC improvements of up to 12.56\\%. These results demonstrate IADA's\npotential for developing reliable and equitable medical imaging systems for\ndiverse clinical settings. 
The code is made publicly available at\n\\url{https://github.com/yinghemedical/imbalance-aware_domain_adaptation}\n","authors":["Lei Li","Xinglin Zhang","Jun Liang","Tao Chen"],"pdf_url":"https://arxiv.org/pdf/2501.04958v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2501.04950v1","updated":"2025-01-09T03:58:02Z","published":"2025-01-09T03:58:02Z","title":"MORDA: A Synthetic Dataset to Facilitate Adaptation of Object Detectors\n to Unseen Real-target Domain While Preserving Performance on Real-source\n Domain","summary":" Deep neural network (DNN) based perception models are indispensable in the\ndevelopment of autonomous vehicles (AVs). However, their reliance on\nlarge-scale, high-quality data is broadly recognized as a burdensome necessity\ndue to the substantial cost of data acquisition and labeling. Further, the\nissue is not a one-time concern, as AVs might need a new dataset if they are to\nbe deployed to another region (real-target domain) that the in-hand dataset\nwithin the real-source domain cannot incorporate. To mitigate this burden, we\npropose leveraging synthetic environments as an auxiliary domain where the\ncharacteristics of real domains are reproduced. This approach could enable\nindirect experience about the real-target domain in a time- and cost-effective\nmanner. As a practical demonstration of our methodology, nuScenes and South\nKorea are employed to represent real-source and real-target domains,\nrespectively. Specifically, we construct digital twins for several regions of\nSouth Korea, and the data-acquisition framework of nuScenes is reproduced.\nBlending the aforementioned components within a simulator allows us to obtain a\nsynthetic-fusion domain in which we forge our novel driving dataset, MORDA:\nMixture Of Real-domain characteristics for synthetic-data-assisted Domain\nAdaptation. 
To verify the value of synthetic features that MORDA provides in\nlearning about driving environments of South Korea, 2D/3D detectors are trained\nsolely on a combination of nuScenes and MORDA. Afterward, their performance is\nevaluated on the unforeseen real-world dataset (AI-Hub) collected in South\nKorea. Our experiments show that MORDA can significantly improve mean\nAverage Precision (mAP) on the AI-Hub dataset, while mAP on nuScenes is retained or\nslightly enhanced.\n","authors":["Hojun Lim","Heecheol Yoo","Jinwoo Lee","Seungmin Jeon","Hyeongseok Jeon"],"pdf_url":"https://arxiv.org/pdf/2501.04950v1.pdf","comment":"7 pages, 6 figures, 4 tables, This work has been submitted to the\n IEEE for possible publication (the paper is submitted to the conference\n ICRA2025 and is under review)"},{"id":"http://arxiv.org/abs/2501.04947v1","updated":"2025-01-09T03:50:00Z","published":"2025-01-09T03:50:00Z","title":"Seeing with Partial Certainty: Conformal Prediction for Robotic Scene\n Recognition in Built Environments","summary":" In assistive robotics serving people with disabilities (PWD), accurate place\nrecognition in built environments is crucial to ensure that robots navigate and\ninteract safely within diverse indoor spaces. Language interfaces, particularly\nthose powered by Large Language Models (LLM) and Vision Language Models (VLM),\nhold significant promise in this context, as they can interpret visual scenes\nand correlate them with semantic information. However, such interfaces are also\nknown for their hallucinated predictions. In addition, language instructions\nprovided by humans can also be ambiguous and lack precise details about\nspecific locations, objects, or actions, exacerbating the hallucination issue.\nIn this work, we introduce Seeing with Partial Certainty (SwPC) - a framework\ndesigned to measure and align uncertainty in VLM-based place recognition,\nenabling the model to recognize when it lacks confidence and seek assistance\nwhen necessary. 
This framework is built on the theory of conformal prediction\nto provide statistical guarantees on place recognition while minimizing\nrequests for human help in complex indoor environment settings. Through\nexperiments on the widely used richly-annotated scene dataset Matterport3D, we\nshow that SwPC significantly increases the success rate and decreases the\namount of human intervention required relative to the prior art. SwPC can be\nutilized with any VLMs directly without requiring model fine-tuning, offering a\npromising, lightweight approach to uncertainty modeling that complements and\nscales alongside the expanding capabilities of foundational models.\n","authors":["Yifan Xu","Vineet Kamat","Carol Menassa"],"pdf_url":"https://arxiv.org/pdf/2501.04947v1.pdf","comment":"10 pages, 4 Figures"},{"id":"http://arxiv.org/abs/2412.18696v2","updated":"2025-01-09T03:39:37Z","published":"2024-12-24T22:55:35Z","title":"STITCH: Surface reconstrucTion using Implicit neural representations\n with Topology Constraints and persistent Homology","summary":" We present STITCH, a novel approach for neural implicit surface\nreconstruction of a sparse and irregularly spaced point cloud while enforcing\ntopological constraints (such as having a single connected component). We\ndevelop a new differentiable framework based on persistent homology to\nformulate topological loss terms that enforce the prior of a single 2-manifold\nobject. Our method demonstrates excellent performance in preserving the\ntopology of complex 3D geometries, evident through both visual and empirical\ncomparisons. We supplement this with a theoretical analysis, and provably show\nthat optimizing the loss with stochastic (sub)gradient descent leads to\nconvergence and enables reconstructing shapes with a single connected\ncomponent. 
Our approach showcases the integration of differentiable topological\ndata analysis tools for implicit surface reconstruction.\n","authors":["Anushrut Jignasu","Ethan Herron","Zhanhong Jiang","Soumik Sarkar","Chinmay Hegde","Baskar Ganapathysubramanian","Aditya Balu","Adarsh Krishnamurthy"],"pdf_url":"https://arxiv.org/pdf/2412.18696v2.pdf","comment":"19 pages, 12 figures, 29 tables"},{"id":"http://arxiv.org/abs/2411.18729v2","updated":"2025-01-09T03:34:55Z","published":"2024-11-27T20:08:55Z","title":"Multi-Task Model Merging via Adaptive Weight Disentanglement","summary":" Model merging has recently gained attention as an economical and scalable\napproach to incorporate task-specific weights from various tasks into a unified\nmulti-task model. For example, in Task Arithmetic (TA), adding the fine-tuned\nweights of different tasks can enhance the model's performance on those tasks,\nwhile subtracting them leads to task forgetting. Although TA is highly\neffective, interference among tasks still hampers the performance of the merged\nmodel. Existing methods for handling conflicts between tasks generally rely on\nempirical selection, resulting in suboptimal performance. In this paper, we\nintroduce an Adaptive Weight Disentanglement method. We begin by theoretically\nproving that task vectors employed in model merging should be orthogonal to\nminimize interference among tasks. Guided by this insight, we initialize\nredundant vectors such that, when subtracted from the original task vectors,\nthe resulting vectors exhibit increased orthogonality. Additionally, we impose\na norm constraint on the redundant vectors to preserve the performance of the\ntask-specific models. Experimental results demonstrate the effectiveness of our\nproposed technique: it successfully extracts redundant vectors, and after their\nsubtraction, the task vectors not only retain robust performance but also\nachieve superior fusion outcomes. 
Our code is available at\n\\href{https://github.com/FarisXiong/AWD.git}{https://github.com/FarisXiong/AWD.git}.\n","authors":["Feng Xiong","Runxi Cheng","Wang Chen","Zhanqiu Zhang","Yiwen Guo","Chun Yuan","Ruifeng Xu"],"pdf_url":"https://arxiv.org/pdf/2411.18729v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04944v1","updated":"2025-01-09T03:27:47Z","published":"2025-01-09T03:27:47Z","title":"MambaHSI: Spatial-Spectral Mamba for Hyperspectral Image Classification","summary":" Transformers have been extensively explored for hyperspectral image (HSI)\nclassification. However, transformers pose challenges in terms of speed and\nmemory usage because of their quadratic computational complexity. Recently, the\nMamba model has emerged as a promising approach, which has strong long-distance\nmodeling capabilities while maintaining a linear computational complexity.\nHowever, representing the HSI is challenging for the Mamba due to the\nrequirement for an integrated spatial and spectral understanding. To remedy\nthese drawbacks, we propose a novel HSI classification model based on a Mamba\nmodel, named MambaHSI, which can simultaneously model long-range interaction of\nthe whole image and integrate spatial and spectral information in an adaptive\nmanner. Specifically, we design a spatial Mamba block (SpaMB) to model the\nlong-range interaction of the whole image at the pixel level. Then, we propose\na spectral Mamba block (SpeMB) to split the spectral vector into multiple\ngroups, mine the relations across different spectral groups, and extract\nspectral features. Finally, we propose a spatial-spectral fusion module (SSFM)\nto adaptively integrate the spatial and spectral features of an HSI. To the best of our\nknowledge, this is the first image-level HSI classification model based on\nMamba. We conduct extensive experiments on four diverse HSI datasets. The\nresults demonstrate the effectiveness and superiority of the proposed model for\nHSI classification. 
This reveals the great potential of Mamba to be the\nnext-generation backbone for HSI models. Codes are available at\nhttps://github.com/li-yapeng/MambaHSI .\n","authors":["Yapeng Li","Yong Luo","Lefei Zhang","Zengmao Wang","Bo Du"],"pdf_url":"https://arxiv.org/pdf/2501.04944v1.pdf","comment":"accepted by IEEE TGRS"},{"id":"http://arxiv.org/abs/2501.00358v2","updated":"2025-01-09T03:25:24Z","published":"2024-12-31T09:22:38Z","title":"Embodied VideoAgent: Persistent Memory from Egocentric Videos and\n Embodied Sensors Enables Dynamic Scene Understanding","summary":" This paper investigates the problem of understanding dynamic 3D scenes from\negocentric observations, a key challenge in robotics and embodied AI. Unlike\nprior studies that explored this as long-form video understanding and utilized\negocentric video only, we instead propose an LLM-based agent, Embodied\nVideoAgent, which constructs scene memory from both egocentric video and\nembodied sensory inputs (e.g. depth and pose sensing). We further introduce a\nVLM-based approach to automatically update the memory when actions or\nactivities over objects are perceived. Embodied VideoAgent attains significant\nadvantages over counterparts in challenging reasoning and planning tasks in 3D\nscenes, achieving gains of 4.9% on Ego4D-VQ3D, 5.8% on OpenEQA, and 11.7% on\nEnvQA. We have also demonstrated its potential in various embodied AI tasks\nincluding generating embodied interactions and perception for robot\nmanipulation. 
The code and demo will be made public.\n","authors":["Yue Fan","Xiaojian Ma","Rongpeng Su","Jun Guo","Rujie Wu","Xi Chen","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2501.00358v2.pdf","comment":"project page: https://embodied-videoagent.github.io/"},{"id":"http://arxiv.org/abs/2501.04939v1","updated":"2025-01-09T03:04:08Z","published":"2025-01-09T03:04:08Z","title":"Multi-Context Temporal Consistent Modeling for Referring Video Object\n Segmentation","summary":" Referring video object segmentation aims to segment objects within a video\ncorresponding to a given text description. Existing transformer-based temporal\nmodeling approaches face challenges related to query inconsistency and the\nlimited consideration of context. Query inconsistency produces unstable masks\nof different objects in the middle of the video. The limited consideration of\ncontext leads to the segmentation of incorrect objects by failing to adequately\naccount for the relationship between the given text and instances. To address\nthese issues, we propose the Multi-context Temporal Consistency Module (MTCM),\nwhich consists of an Aligner and a Multi-Context Enhancer (MCE). The Aligner\nremoves noise from queries and aligns them to achieve query consistency. The\nMCE predicts text-relevant queries by considering multi-context. We applied\nMTCM to four different models, increasing performance across all of them,\nparticularly achieving 47.6 J&F on the MeViS. 
Code is available at\nhttps://github.com/Choi58/MTCM.\n","authors":["Sun-Hyuk Choi","Hayoung Jo","Seong-Whan Lee"],"pdf_url":"https://arxiv.org/pdf/2501.04939v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04934v1","updated":"2025-01-09T02:52:30Z","published":"2025-01-09T02:52:30Z","title":"Plug-and-Play DISep: Separating Dense Instances for Scene-to-Pixel\n Weakly-Supervised Change Detection in High-Resolution Remote Sensing Images","summary":" Existing Weakly-Supervised Change Detection (WSCD) methods often encounter\nthe problem of \"instance lumping\" under scene-level supervision, particularly\nin scenarios with a dense distribution of changed instances (i.e., changed\nobjects). In these scenarios, unchanged pixels between changed instances are\nalso mistakenly identified as changed, causing multiple changes to be\nmistakenly viewed as one. In practical applications, this issue prevents the\naccurate quantification of the number of changes. To address this issue, we\npropose a Dense Instance Separation (DISep) method as a plug-and-play solution,\nrefining pixel features from a unified instance perspective under scene-level\nsupervision. Specifically, our DISep comprises a three-step iterative training\nprocess: 1) Instance Localization: We locate instance candidate regions for\nchanged pixels using high-pass class activation maps. 2) Instance Retrieval: We\nidentify and group these changed pixels into different instance IDs through\nconnectivity searching. Then, based on the assigned instance IDs, we extract\ncorresponding pixel-level features on a per-instance basis. 3) Instance\nSeparation: We introduce a separation loss to enforce intra-instance pixel\nconsistency in the embedding space, thereby ensuring separable instance feature\nrepresentations. The proposed DISep adds only minimal training cost and no\ninference cost. It can be seamlessly integrated to enhance existing WSCD\nmethods. 
We achieve state-of-the-art performance by enhancing three\nTransformer-based and four ConvNet-based methods on the LEVIR-CD, WHU-CD,\nDSIFN-CD, SYSU-CD, and CDD datasets. Additionally, our DISep can be used to\nimprove fully-supervised change detection methods. Code is available at\nhttps://github.com/zhenghuizhao/Plug-and-Play-DISep-for-Change-Detection.\n","authors":["Zhenghui Zhao","Chen Wu","Lixiang Ru","Di Wang","Hongruixuan Chen","Cuiqun Chen"],"pdf_url":"https://arxiv.org/pdf/2501.04934v1.pdf","comment":"Accepted by ISPRS Journal of Photogrammetry and Remote Sensing"},{"id":"http://arxiv.org/abs/2501.01808v2","updated":"2025-01-09T02:45:43Z","published":"2025-01-03T13:43:21Z","title":"MoEE: Mixture of Emotion Experts for Audio-Driven Portrait Animation","summary":" The generation of talking avatars has achieved significant advancements in\nprecise audio synchronization. However, crafting lifelike talking head videos\nrequires capturing a broad spectrum of emotions and subtle facial expressions.\nCurrent methods face fundamental challenges: a) the absence of frameworks for\nmodeling single basic emotional expressions, which restricts the generation of\ncomplex emotions such as compound emotions; b) the lack of comprehensive\ndatasets rich in human emotional expressions, which limits the potential of\nmodels. To address these challenges, we propose the following innovations: 1)\nthe Mixture of Emotion Experts (MoEE) model, which decouples six fundamental\nemotions to enable the precise synthesis of both singular and compound\nemotional states; 2) the DH-FaceEmoVid-150 dataset, specifically curated to\ninclude six prevalent human emotional expressions as well as four types of\ncompound emotions, thereby expanding the training potential of emotion-driven\nmodels. 
Furthermore, to enhance the flexibility of emotion control, we propose\nan emotion-to-latents module that leverages multimodal inputs, aligning diverse\ncontrol signals, such as audio, text, and labels, to ensure more varied control\ninputs as well as the ability to control emotions using audio alone. Through\nextensive quantitative and qualitative evaluations, we demonstrate that the\nMoEE framework, in conjunction with the DH-FaceEmoVid-150 dataset, excels in\ngenerating complex emotional expressions and nuanced facial details, setting a\nnew benchmark in the field. These datasets will be publicly released.\n","authors":["Huaize Liu","Wenzhang Sun","Donglin Di","Shibo Sun","Jiahui Yang","Changqing Zou","Hujun Bao"],"pdf_url":"https://arxiv.org/pdf/2501.01808v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04928v1","updated":"2025-01-09T02:36:21Z","published":"2025-01-09T02:36:21Z","title":"Image2CADSeq: Computer-Aided Design Sequence and Knowledge Inference\n from Product Images","summary":" Computer-aided design (CAD) tools empower designers to design and modify 3D\nmodels through a series of CAD operations, commonly referred to as a CAD\nsequence. In scenarios where digital CAD files are not accessible, reverse\nengineering (RE) has been used to reconstruct 3D CAD models. Recent advances\nhave seen the rise of data-driven approaches for RE, with a primary focus on\nconverting 3D data, such as point clouds, into 3D models in boundary\nrepresentation (B-rep) format. However, obtaining 3D data poses significant\nchallenges, and B-rep models do not reveal knowledge about the 3D modeling\nprocess of designs. To this end, our research introduces a novel data-driven\napproach with an Image2CADSeq neural network model. This model aims to reverse\nengineer CAD models by processing images as input and generating CAD sequences.\nThese sequences can then be translated into B-rep models using a solid modeling\nkernel. 
Unlike B-rep models, CAD sequences offer enhanced flexibility to modify\nindividual steps of model creation, providing a deeper understanding of the\nconstruction process of CAD models. To quantitatively and rigorously evaluate\nthe predictive performance of the Image2CADSeq model, we have developed a\nmulti-level evaluation framework for model assessment. The model was trained on\na specially synthesized dataset, and various network architectures were\nexplored to optimize the performance. The experimental and validation results\nshow great potential for the model in generating CAD sequences from 2D image\ndata.\n","authors":["Xingang Li","Zhenghui Sha"],"pdf_url":"https://arxiv.org/pdf/2501.04928v1.pdf","comment":"20 pages, 10 figures, and 6 tables"},{"id":"http://arxiv.org/abs/2404.06429v3","updated":"2025-01-09T02:34:25Z","published":"2024-04-09T16:20:03Z","title":"Magic-Boost: Boost 3D Generation with Multi-View Conditioned Diffusion","summary":" Benefiting from the rapid development of 2D diffusion models, 3D content\ngeneration has witnessed significant progress. One promising solution is to\nfinetune the pre-trained 2D diffusion models to produce multi-view images and\nthen reconstruct them into 3D assets via feed-forward sparse-view\nreconstruction models. However, limited by the 3D inconsistency in the\ngenerated multi-view images and the low reconstruction resolution of the\nfeed-forward reconstruction models, the generated 3D assets still suffer from\nincorrect geometries and blurry textures. To address this problem, we\npresent a multi-view based refinement method, named Magic-Boost, to further refine\nthe generation results. 
In detail, we first propose a novel multi-view\nconditioned diffusion model which extracts a 3D prior from the synthesized\nmulti-view images to synthesize high-fidelity novel view images, and then\nintroduce a novel iterative-update strategy that applies it to provide precise\nguidance to refine the coarse generated results through a fast optimization\nprocess. Conditioned on the strong 3D priors extracted from the synthesized\nmulti-view images, Magic-Boost is capable of providing precise optimization\nguidance that well aligns with the coarse generated 3D assets, enriching the\nlocal detail in both geometry and texture within a short time ($\\sim15$min).\nExtensive experiments show that Magic-Boost greatly enhances the coarse generated\ninputs and generates high-quality 3D assets with rich geometric and textural\ndetails. (Project Page: https://magic-research.github.io/magic-boost/)\n","authors":["Fan Yang","Jianfeng Zhang","Yichun Shi","Bowen Chen","Chenxu Zhang","Huichao Zhang","Xiaofeng Yang","Xiu Li","Jiashi Feng","Guosheng Lin"],"pdf_url":"https://arxiv.org/pdf/2404.06429v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.19407v5","updated":"2025-01-09T02:33:15Z","published":"2024-06-12T06:41:23Z","title":"YOLO11 to Its Genesis: A Decadal and Comprehensive Review of The You\n Only Look Once (YOLO) Series","summary":" This review\nsystematically examines the progression of the You Only Look Once (YOLO) object\ndetection algorithms from YOLOv1 to the recently unveiled YOLO11 (or YOLOv11).\nEmploying a reverse chronological analysis, this study examines the\nadvancements introduced by YOLO algorithms, beginning with YOLOv11 and\nprogressing through YOLOv10, YOLOv9, YOLOv8, and subsequent versions to explore\neach version's contributions to enhancing speed, detection accuracy, and\ncomputational efficiency in real-time object detection. 
By detailing the\nincremental technological advancements in subsequent YOLO versions, this review\nchronicles the evolution of YOLO and discusses the challenges and limitations\nof each earlier version. The evolution signifies a path towards integrating\nYOLO with multimodal, context-aware, and Artificial General Intelligence (AGI)\nsystems for the next YOLO decade, promising significant implications for future\ndevelopments in AI-driven applications.\n","authors":["Ranjan Sapkota","Rizwan Qureshi","Marco Flores Calero","Chetan Badjugar","Upesh Nepal","Alwin Poulose","Peter Zeno","Uday Bhanu Prakash Vaddevolu","Sheheryar Khan","Maged Shoman","Hong Yan","Manoj Karkee"],"pdf_url":"https://arxiv.org/pdf/2406.19407v5.pdf","comment":"11 Figures, 7 Tables"},{"id":"http://arxiv.org/abs/2501.04914v1","updated":"2025-01-09T02:10:15Z","published":"2025-01-09T02:10:15Z","title":"From Mesh Completion to AI Designed Crown","summary":" Designing a dental crown is a time-consuming and labor-intensive process. Our\ngoal is to simplify crown design and minimize the tediousness of making manual\nadjustments while still ensuring the highest level of accuracy and consistency.\nTo this end, we present a new end-to-end deep learning approach, coined Dental\nMesh Completion (DMC), to generate a crown mesh conditioned on a point cloud\ncontext. The dental context includes the tooth prepared to receive a crown and\nits surroundings, namely the two adjacent teeth and the three closest teeth in\nthe opposing jaw. We formulate crown generation in terms of completing this\npoint cloud context. A feature extractor first converts the input point cloud\ninto a set of feature vectors that represent local regions in the point cloud.\nThe set of feature vectors is then fed into a transformer to predict a new set\nof feature vectors for the missing region (crown). 
Subsequently, a point\nreconstruction head, followed by a multi-layer perceptron, is used to predict a\ndense set of points with normals. Finally, a differentiable point-to-mesh layer\nserves to reconstruct the crown surface mesh. We compare our DMC method to a\ngraph-based convolutional neural network which learns to deform a crown mesh\nfrom a generic crown shape to the target geometry. Extensive experiments on our\ndataset demonstrate the effectiveness of our method, which attains an average\nChamfer Distance of 0.062. The code is available at:\nhttps://github.com/Golriz-code/DMC.gi\n","authors":["Golriz Hosseinimanesh","Farnoosh Ghadiri","Francois Guibault","Farida Cheriet","Julia Keren"],"pdf_url":"https://arxiv.org/pdf/2501.04914v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.12620v2","updated":"2025-01-09T02:00:15Z","published":"2024-12-17T07:33:07Z","title":"Multi-Domain Features Guided Supervised Contrastive Learning for Radar\n Target Detection","summary":" Detecting small targets in sea clutter is challenging due to dynamic maritime\nconditions. Existing solutions either model sea clutter for detection or\nextract target features based on clutter-target echo differences, including\nstatistical and deep features. While more common, the latter often excels in\ncontrolled scenarios but struggles with robust detection and generalization in\ndiverse environments, limiting practical use. In this letter, we propose a\nmulti-domain features guided supervised contrastive learning (MDFG_SCL) method,\nwhich integrates statistical features derived from multi-domain differences\nwith deep features obtained through supervised contrastive learning, thereby\ncapturing both low-level domain-specific variations and high-level semantic\ninformation. This comprehensive feature integration enables the model to\neffectively distinguish between small targets and sea clutter, even under\nchallenging conditions. 
Experiments conducted on real-world datasets\ndemonstrate that the proposed shallow-to-deep detector not only achieves\neffective identification of small maritime targets but also maintains superior\ndetection performance across varying sea conditions, outperforming the\nmainstream unsupervised contrastive learning and supervised contrastive\nlearning methods.\n","authors":["Junjie Wang","Yuze Gao","Dongying Li","Wenxian Yu"],"pdf_url":"https://arxiv.org/pdf/2412.12620v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04284v2","updated":"2025-01-09T01:58:41Z","published":"2025-01-08T05:15:43Z","title":"ContextMRI: Enhancing Compressed Sensing MRI through Metadata\n Conditioning","summary":" Compressed sensing MRI seeks to accelerate MRI acquisition processes by\nsampling fewer k-space measurements and then reconstructing the missing data\nalgorithmically. The success of these approaches often relies on strong priors\nor learned statistical models. While recent diffusion model-based priors have\nshown great potential, previous methods typically ignore clinically available\nmetadata (e.g. patient demographics, imaging parameters, slice-specific\ninformation). In practice, metadata contains meaningful cues about the anatomy\nand acquisition protocol, suggesting it could further constrain the\nreconstruction problem. In this work, we propose ContextMRI, a text-conditioned\ndiffusion model for MRI that integrates granular metadata into the\nreconstruction process. We train a pixel-space diffusion model directly on\nminimally processed, complex-valued MRI images. During inference, metadata is\nconverted into a structured text prompt and fed to the model via CLIP text\nembeddings. By conditioning the prior on metadata, we unlock more accurate\nreconstructions and show consistent gains across multiple datasets,\nacceleration factors, and undersampling patterns. 
Our experiments demonstrate\nthat increasing the fidelity of metadata, ranging from slice location and\ncontrast to patient age, sex, and pathology, systematically boosts\nreconstruction performance. This work highlights the untapped potential of\nleveraging clinical context for inverse problems and opens a new direction for\nmetadata-driven MRI reconstruction.\n","authors":["Hyungjin Chung","Dohun Lee","Zihui Wu","Byung-Hoon Kim","Katherine L. Bouman","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2501.04284v2.pdf","comment":"29 pages, 9 figures. Code is available at\n https://github.com/DoHunLee1/ContextMRI"},{"id":"http://arxiv.org/abs/2501.04911v1","updated":"2025-01-09T01:58:14Z","published":"2025-01-09T01:58:14Z","title":"A Machine Learning Model for Crowd Density Classification in Hajj Video\n Frames","summary":" Managing the massive annual gatherings of Hajj and Umrah presents significant\nchallenges, particularly as the Saudi government aims to increase the number of\npilgrims. Currently, around two million pilgrims attend Hajj and 26 million\nattend Umrah, making crowd control, especially in critical areas like the Grand\nMosque during Tawaf, a major concern. Additional risks arise in managing dense\ncrowds at key sites such as Arafat, where the potential for stampedes, fires and\npandemics poses serious threats to public safety. This research proposes a\nmachine learning model to classify crowd density into three levels: moderate\ncrowd, overcrowded, and very dense crowd in video frames recorded during Hajj,\nwith a flashing red light to alert organizers in real-time when a very dense\ncrowd is detected. 
While current research efforts in processing Hajj\nsurveillance videos focus solely on using CNNs to detect abnormal behaviors,\nthis research focuses more on high-risk crowds that can lead to disasters.\nHazardous crowd conditions require a robust method, as incorrect classification\ncould trigger unnecessary alerts and government intervention, while failure to\nclassify could result in disaster. The proposed model integrates Local Binary\nPattern (LBP) texture analysis, which enhances feature extraction for\ndifferentiating crowd density levels, along with edge density and area-based\nfeatures. The model was tested on the KAU-Smart Crowd 'HAJJv2' dataset, which\ncontains 18 videos from various key locations during Hajj, including 'Massaa',\n'Jamarat', 'Arafat' and 'Tawaf'. The model achieved an accuracy rate of 87%\nwith a 2.14% error percentage (misclassification rate), demonstrating its\nability to detect and classify various crowd conditions effectively. This\ncontributes to enhanced crowd management and safety during large-scale events\nlike Hajj.\n","authors":["Afnan A. Shah"],"pdf_url":"https://arxiv.org/pdf/2501.04911v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09236v2","updated":"2025-01-09T01:20:46Z","published":"2024-03-14T09:59:55Z","title":"Hyper-3DG: Text-to-3D Gaussian Generation via Hypergraph","summary":" Text-to-3D generation represents an exciting field that has seen rapid\nadvancements, facilitating the transformation of textual descriptions into\ndetailed 3D models. However, current progress often neglects the intricate\nhigh-order correlation of geometry and texture within 3D objects, leading to\nchallenges such as over-smoothness, over-saturation and the Janus problem. In\nthis work, we propose a method named ``3D Gaussian Generation via Hypergraph\n(Hyper-3DG)'', designed to capture the sophisticated high-order correlations\npresent within 3D objects. 
Our framework is anchored by a well-established\nmainflow and an essential module, named ``Geometry and Texture Hypergraph\nRefiner (HGRefiner)''. This module not only refines the representation of 3D\nGaussians but also accelerates the update process of these 3D Gaussians by\nconducting the Patch-3DGS Hypergraph Learning on both explicit attributes and\nlatent visual features. Our framework allows for the production of finely\ngenerated 3D objects within a cohesive optimization, effectively circumventing\ndegradation. Extensive experimentation has shown that our proposed method\nsignificantly enhances the quality of 3D generation while incurring no\nadditional computational overhead for the underlying framework. (Project code:\nhttps://github.com/yjhboy/Hyper3DG)\n","authors":["Donglin Di","Jiahui Yang","Chaofan Luo","Zhou Xue","Wei Chen","Xun Yang","Yue Gao"],"pdf_url":"https://arxiv.org/pdf/2403.09236v2.pdf","comment":"Accepted by IJCV"},{"id":"http://arxiv.org/abs/2410.04041v4","updated":"2025-01-09T00:39:56Z","published":"2024-10-05T05:26:21Z","title":"EndoPerfect: A Hybrid NeRF-Stereo Vision Approach Pioneering Monocular\n Depth Estimation and 3D Reconstruction in Endoscopy","summary":" 3D reconstruction in endoscopic sinus surgery (ESS) demands exceptional\naccuracy, with the mean error and standard deviation necessitating within the\nrange of a single CT slice (0.625 mm), as the critical structures in the nasal\ncavity are situated within submillimeter distances from surgical instruments.\nThis poses a formidable challenge when using conventional monocular endoscopes.\nDepth estimation is crucial for 3D reconstruction, yet existing depth\nestimation methodologies either suffer from inherent accuracy limitations or,\nin the case of learning-based approaches, perform poorly when applied to ESS\ndespite succeeding on their original datasets. 
In this study, we present a\nnovel, highly generalizable method that combines Neural Radiance Fields (NeRF)\nand stereo depth estimation for 3D reconstruction that can derive metric\nmonocular depth. Our approach begins with an initial NeRF reconstruction\nyielding a coarse 3D scene, followed by the creation of binocular pairs within\nthe coarse 3D scene and the generation of depth maps through stereo vision. These\ndepth maps are used to supervise the subsequent NeRF iteration, progressively\nrefining the NeRF and binocular depth; the refinement process continues until the\ndepth maps converge. This recursive process generates high-accuracy depth maps\nfrom monocular endoscopic video. Evaluation in synthetic endoscopy shows a\ndepth accuracy of 0.125 $\pm$ 0.443 mm, well within the 0.625 mm threshold.\nFurther clinical experiments with real endoscopic data demonstrate a mean\ndistance to CT mesh of 0.269 mm, representing the highest accuracy among\nmonocular 3D reconstruction methods in ESS.\n","authors":["Pengcheng Chen","Wenhao Li","Nicole Gunderson","Jeremy Ruthberg","Randall Bly","Zhenglong Sun","Waleed M. Abuzeid","Eric J. Seibel"],"pdf_url":"https://arxiv.org/pdf/2410.04041v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05611v1","updated":"2025-01-09T23:20:19Z","published":"2025-01-09T23:20:19Z","title":"Bit-depth color recovery via off-the-shelf super-resolution models","summary":" Advancements in imaging technology have enabled hardware to support 10 to 16\nbits per channel, facilitating precise manipulation in applications like image\nediting and video processing. While deep neural networks promise to recover\nhigh bit-depth representations, existing methods often rely on scale-invariant\nimage information, limiting performance in certain scenarios. In this paper, we\nintroduce a novel approach that integrates a super-resolution architecture to\nextract detailed a priori information from images. 
By leveraging interpolated\ndata generated during the super-resolution process, our method achieves\npixel-level recovery of fine-grained color details. Additionally, we\ndemonstrate that spatial features learned through the super-resolution process\nsignificantly contribute to the recovery of detailed color depth information.\nExperiments on benchmark datasets demonstrate that our approach outperforms\nstate-of-the-art methods, highlighting the potential of super-resolution for\nhigh-fidelity color restoration.\n","authors":["Xuanshuo Fu","Danna Xue","Javier Vazquez-Corral"],"pdf_url":"https://arxiv.org/pdf/2501.05611v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02095v2","updated":"2025-01-09T22:38:13Z","published":"2024-11-04T13:59:01Z","title":"The evolution of volumetric video: A survey of smart transcoding and\n compression approaches","summary":" Volumetric video, the capture and display of three-dimensional (3D) imagery,\nhas emerged as a revolutionary technology poised to transform the media\nlandscape, enabling immersive experiences that transcend the limitations of\ntraditional 2D video. One of the key challenges in this domain is the efficient\ndelivery of these high-bandwidth, data-intensive volumetric video streams,\nwhich requires innovative transcoding and compression techniques. 
This research\npaper explores the state-of-the-art in volumetric video compression and\ndelivery, with a focus on the potential of AI-driven solutions to address the\nunique challenges posed by this emerging medium.\n","authors":["Preetish Kakkar","Hariharan Ragothaman"],"pdf_url":"https://arxiv.org/pdf/2411.02095v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.08563v2","updated":"2025-01-09T22:36:57Z","published":"2024-12-11T17:31:17Z","title":"Physics Based Differentiable Rendering for Inverse Problems and Beyond","summary":" Physics-based differentiable rendering (PBDR) has become an efficient method\nin computer vision, graphics, and machine learning for addressing an array of\ninverse problems. PBDR allows patterns to be generated from perceptions, which\ncan be applied to enhance object attributes like geometry, substances, and\nlighting by adding physical models of light propagation and materials\ninteraction. Due to these capabilities, differentiable rendering has been\nemployed in a wider range of sectors such as autonomous navigation, scene\nreconstruction, and material design. We provide an extensive overview of PBDR\ntechniques in this study, emphasizing their creation, effectiveness, and\nlimitations while managing inverse situations. We demonstrate modern techniques\nand examine their value in everyday situations.\n","authors":["Preetish Kakkar","Srijani Mukherjee","Hariharan Ragothaman","Vishal Mehta"],"pdf_url":"https://arxiv.org/pdf/2412.08563v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.17155v4","updated":"2025-01-09T22:23:15Z","published":"2023-03-30T05:25:20Z","title":"Discriminative Class Tokens for Text-to-Image Diffusion Models","summary":" Recent advances in text-to-image diffusion models have enabled the generation\nof diverse and high-quality images. While impressive, the images often fall\nshort of depicting subtle details and are susceptible to errors due to\nambiguity in the input text. 
One way of alleviating these issues is to train\ndiffusion models on class-labeled datasets. This approach has two\ndisadvantages: (i) supervised datasets are generally small compared to\nlarge-scale scraped text-image datasets on which text-to-image models are\ntrained, affecting the quality and diversity of the generated images, or (ii)\nthe input is a hard-coded label, as opposed to free-form text, limiting the\ncontrol over the generated images.\n In this work, we propose a non-invasive fine-tuning technique that\ncapitalizes on the expressive potential of free-form text while achieving high\naccuracy through discriminative signals from a pretrained classifier. This is\ndone by iteratively modifying the embedding of an added input token of a\ntext-to-image diffusion model, by steering generated images toward a given\ntarget class according to a classifier. Our method is fast compared to prior\nfine-tuning methods and does not require a collection of in-class images or\nretraining of a noise-tolerant classifier. We evaluate our method extensively,\nshowing that the generated images are: (i) more accurate and of higher quality\nthan standard diffusion models, (ii) can be used to augment training data in a\nlow-resource setting, and (iii) reveal information about the data used to train\nthe guiding classifier. The code is available at\n\\url{https://github.com/idansc/discriminative_class_tokens}.\n","authors":["Idan Schwartz","Vésteinn Snæbjarnarson","Hila Chefer","Ryan Cotterell","Serge Belongie","Lior Wolf","Sagie Benaim"],"pdf_url":"https://arxiv.org/pdf/2303.17155v4.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2411.13553v2","updated":"2025-01-09T22:17:30Z","published":"2024-11-20T18:59:58Z","title":"AI-generated Image Detection: Passive or Watermark?","summary":" While text-to-image models offer numerous benefits, they also pose\nsignificant societal risks. Detecting AI-generated images is crucial for\nmitigating these risks. 
Detection methods can be broadly categorized into\npassive and watermark-based approaches: passive detectors rely on artifacts\npresent in AI-generated images, whereas watermark-based detectors proactively\nembed watermarks into such images. A key question is which type of detector\nperforms better in terms of effectiveness, robustness, and efficiency. However,\nthe current literature lacks a comprehensive understanding of this issue. In\nthis work, we aim to bridge that gap by developing ImageDetectBench, the first\ncomprehensive benchmark to compare the effectiveness, robustness, and\nefficiency of passive and watermark-based detectors. Our benchmark includes\nfour datasets, each containing a mix of AI-generated and non-AI-generated\nimages. We evaluate five passive detectors and four watermark-based detectors\nagainst eight types of common perturbations and three types of adversarial\nperturbations. Our benchmark results reveal several interesting findings. For\ninstance, watermark-based detectors consistently outperform passive detectors,\nboth in the presence and absence of perturbations. Based on these insights, we\nprovide recommendations for detecting AI-generated images, e.g., when both\ntypes of detectors are applicable, watermark-based detectors should be the\npreferred choice. Our code and data are publicly available at\nhttps://github.com/moyangkuo/ImageDetectBench.git.\n","authors":["Moyang Guo","Yuepeng Hu","Zhengyuan Jiang","Zeyu Li","Amir Sadovnik","Arka Daw","Neil Gong"],"pdf_url":"https://arxiv.org/pdf/2411.13553v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06687v2","updated":"2025-01-09T22:14:55Z","published":"2024-08-13T07:27:02Z","title":"Masked Image Modeling: A Survey","summary":" In this work, we survey recent studies on masked image modeling (MIM), an\napproach that emerged as a powerful self-supervised learning technique in\ncomputer vision. 
The MIM task involves masking some information, e.g. pixels,\npatches, or even latent representations, and training a model, usually an\nautoencoder, to predict the missing information by using the context\navailable in the visible part of the input. We identify and formalize two\ncategories of approaches on how to implement MIM as a pretext task, one based\non reconstruction and one based on contrastive learning. Then, we construct a\ntaxonomy and review the most prominent papers in recent years. We complement\nthe manually constructed taxonomy with a dendrogram obtained by applying a\nhierarchical clustering algorithm. We further identify relevant clusters by\nmanually inspecting the resulting dendrogram. Our review also includes datasets\nthat are commonly used in MIM research. We aggregate the performance results of\nvarious masked image modeling methods on the most popular datasets, to\nfacilitate the comparison of competing methods. Finally, we identify research\ngaps and propose several interesting directions of future work. We supplement\nour survey with the following public repository containing organized\nreferences: https://github.com/vladhondru25/MIM-Survey.\n","authors":["Vlad Hondru","Florinel Alin Croitoru","Shervin Minaee","Radu Tudor Ionescu","Nicu Sebe"],"pdf_url":"https://arxiv.org/pdf/2408.06687v2.pdf","comment":"Revised version"},{"id":"http://arxiv.org/abs/2404.18731v3","updated":"2025-01-09T22:10:14Z","published":"2024-04-29T14:17:52Z","title":"Real Time Multi Organ Classification on Computed Tomography Images","summary":" Organ segmentation is a fundamental task in medical imaging since it is\nuseful for many clinical automation pipelines. However, some tasks do not\nrequire full segmentation. Instead, a classifier can identify the selected\norgan without segmenting the entire volume. 
In this study, we demonstrate a\nclassifier-based method to obtain organ labels in real time by using a large\ncontext size with a sparse data sampling strategy. Although our method operates\nas an independent classifier at query locations, it can generate full\nsegmentations by querying grid locations at any resolution, offering faster\nperformance than segmentation algorithms. We compared our method with existing\nsegmentation techniques, demonstrating its superior runtime potential for\npractical applications in medical imaging.\n","authors":["Halid Ziya Yerebakan","Yoshihisa Shinagawa","Gerardo Hermosillo Valadez"],"pdf_url":"https://arxiv.org/pdf/2404.18731v3.pdf","comment":"11 pages, Organ Classification, Organ Segmentation"},{"id":"http://arxiv.org/abs/2501.05567v1","updated":"2025-01-09T20:34:36Z","published":"2025-01-09T20:34:36Z","title":"Approximate Supervised Object Distance Estimation on Unmanned Surface\n Vehicles","summary":" Unmanned surface vehicles (USVs) and boats are increasingly important in\nmaritime operations, yet their deployment is limited due to costly sensors and\ncomplexity. LiDAR, radar, and depth cameras are either costly, yield sparse\npoint clouds, or are noisy, and require extensive calibration. Here, we\nintroduce a novel approach for approximate distance estimation in USVs using\nsupervised object detection. We collected a dataset comprising images with\nmanually annotated bounding boxes and corresponding distance measurements.\nLeveraging this data, we propose a specialized branch of an object detection\nmodel, not only to detect objects but also to predict their distances from the\nUSV. This method offers a cost-efficient and intuitive alternative to\nconventional distance measurement techniques, aligning more closely with human\nestimation capabilities. 
We demonstrate its application in a marine assistance\nsystem that alerts operators to nearby objects such as boats, buoys, or other\nwaterborne hazards.\n","authors":["Benjamin Kiefer","Yitong Quan","Andreas Zell"],"pdf_url":"https://arxiv.org/pdf/2501.05567v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05566v1","updated":"2025-01-09T20:29:31Z","published":"2025-01-09T20:29:31Z","title":"Vision-Language Models for Autonomous Driving: CLIP-Based Dynamic Scene\n Understanding","summary":" Scene understanding is essential for enhancing driver safety, generating\nhuman-centric explanations for Automated Vehicle (AV) decisions, and leveraging\nArtificial Intelligence (AI) for retrospective driving video analysis. This\nstudy developed a dynamic scene retrieval system using Contrastive\nLanguage-Image Pretraining (CLIP) models, which can be optimized for real-time\ndeployment on edge devices. The proposed system outperforms state-of-the-art\nin-context learning methods, including the zero-shot capabilities of GPT-4o,\nparticularly in complex scenarios. By conducting frame-level analysis on the\nHonda Scenes Dataset, which contains a collection of about 80 hours of\nannotated driving videos capturing diverse real-world road and weather\nconditions, our study highlights the robustness of CLIP models in learning\nvisual concepts from natural language supervision. Results also showed that\nfine-tuning the CLIP models, such as ViT-L/14 and ViT-B/32, significantly\nimproved scene classification, achieving a top F1 score of 91.1%. These results\ndemonstrate the ability of the system to deliver rapid and precise scene\nrecognition, which can be used to meet the critical requirements of Advanced\nDriver Assistance Systems (ADAS). This study shows the potential of CLIP models\nto provide scalable and efficient frameworks for dynamic scene understanding\nand classification. 
Furthermore, this work lays the groundwork for advanced\nautonomous vehicle technologies by fostering a deeper understanding of driver\nbehavior, road conditions, and safety-critical scenarios, marking a significant\nstep toward smarter, safer, and more context-aware autonomous driving systems.\n","authors":["Mohammed Elhenawy","Huthaifa I. Ashqar","Andry Rakotonirainy","Taqwa I. Alhadidi","Ahmed Jaber","Mohammad Abu Tami"],"pdf_url":"https://arxiv.org/pdf/2501.05566v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.09566v3","updated":"2025-01-09T20:24:46Z","published":"2024-09-15T00:53:44Z","title":"Learning Transferable Features for Implicit Neural Representations","summary":" Implicit neural representations (INRs) have demonstrated success in a variety\nof applications, including inverse problems and neural rendering. An INR is\ntypically trained to capture one signal of interest, resulting in learned\nneural features that are highly attuned to that signal. Although such features\nare assumed to be less generalizable, we explore their transferability for\nfitting similar signals. We introduce a new INR training\nframework, STRAINER, that learns transferable features for fitting INRs to new\nsignals from a given distribution, faster and with better reconstruction\nquality. Owing to the sequential layer-wise affine operations in an INR, we\npropose to learn transferable representations by sharing initial encoder layers\nacross multiple INRs with independent decoder layers. At test time, the learned\nencoder representations are transferred as initialization for an otherwise\nrandomly initialized INR. We find STRAINER to yield extremely powerful\ninitialization for fitting images from the same domain and allow for $\approx\n+10dB$ gain in signal quality early on compared to an untrained INR itself.\nSTRAINER also provides a simple way to encode data-driven priors in INRs. 
We\nevaluate STRAINER on multiple in-domain and out-of-domain signal fitting tasks\nand inverse problems and further provide detailed analysis and discussion on\nthe transferability of STRAINER's features. Our demo can be accessed at\nhttps://kushalvyas.github.io/strainer.html .\n","authors":["Kushal Vyas","Ahmed Imtiaz Humayun","Aniket Dashpute","Richard G. Baraniuk","Ashok Veeraraghavan","Guha Balakrishnan"],"pdf_url":"https://arxiv.org/pdf/2409.09566v3.pdf","comment":"Project Website: https://kushalvyas.github.io/strainer.html"},{"id":"http://arxiv.org/abs/2412.20110v2","updated":"2025-01-09T20:24:29Z","published":"2024-12-28T10:40:21Z","title":"Cross-Modal Mapping: Eliminating the Modality Gap for Few-Shot Image\n Classification","summary":" In few-shot image classification tasks, methods based on pretrained\nvision-language models (such as CLIP) have achieved significant progress. Many\nexisting approaches directly utilize visual or textual features as class\nprototypes, however, these features fail to adequately represent their\nrespective classes. We identify that this limitation arises from the modality\ngap inherent in pretrained vision-language models, which weakens the connection\nbetween the visual and textual modalities. To eliminate this modality gap and\nenable textual features to fully represent class prototypes, we propose a\nsimple and efficient Cross-Modal Mapping (CMM) method. This method employs a\nlinear transformation to map image features into the textual feature space,\nensuring that both modalities are comparable within the same feature space.\nNevertheless, the modality gap diminishes the effectiveness of this mapping. 
To\naddress this, we further introduce a triplet loss to optimize the spatial\nrelationships between image features and class textual features, allowing class\ntextual features to naturally serve as class prototypes for image features.\nExperimental results on 11 benchmarks demonstrate an average improvement of\napproximately 3.5% compared to conventional methods and exhibit competitive\nperformance on 4 distribution shift benchmarks.\n","authors":["Xi Yang","Pai Peng","Wulin Xie","Xiaohuan Lu","Jie Wen"],"pdf_url":"https://arxiv.org/pdf/2412.20110v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13969v2","updated":"2025-01-09T20:08:31Z","published":"2023-08-26T22:48:06Z","title":"Gaze-Informed Vision Transformers: Predicting Driving Decisions Under\n Uncertainty","summary":" Vision Transformers (ViT) have advanced computer vision, yet their efficacy\nin complex tasks like driving remains less explored. This study enhances ViT by\nintegrating human eye gaze, captured via eye-tracking, to increase prediction\naccuracy in driving scenarios under uncertainty in both real-world and virtual\nreality scenarios. First, we establish the significance of human eye gaze in\nleft-right driving decisions, as observed in both human subjects and a ViT\nmodel. By comparing the similarity between human fixation maps and ViT\nattention weights, we reveal the dynamics of overlap across individual heads\nand layers. This overlap demonstrates that fixation data can guide the model in\ndistributing its attention weights more effectively. We introduce the\nfixation-attention intersection (FAX) loss, a novel loss function that\nsignificantly improves ViT performance under high uncertainty conditions. Our\nresults show that ViT, when trained with FAX loss, aligns its attention with\nhuman gaze patterns. 
This gaze-informed approach has significant potential for\ndriver behavior analysis, as well as broader applications in human-centered AI\nsystems, extending ViT's use to complex visual environments.\n","authors":["Sharath Koorathota","Nikolas Papadopoulos","Jia Li Ma","Shruti Kumar","Xiaoxiao Sun","Arunesh Mittal","Patrick Adelman","Paul Sajda"],"pdf_url":"https://arxiv.org/pdf/2308.13969v2.pdf","comment":"25 pages, 9 figures, 3 tables"},{"id":"http://arxiv.org/abs/2501.05555v1","updated":"2025-01-09T20:02:10Z","published":"2025-01-09T20:02:10Z","title":"Improving Zero-Shot Object-Level Change Detection by Incorporating\n Visual Correspondence","summary":" Detecting object-level changes between two images across possibly different\nviews is a core task in many applications that involve visual inspection or\ncamera surveillance. Existing change-detection approaches suffer from three\nmajor limitations: (1) lack of evaluation on image pairs that contain no\nchanges, leading to unreported false positive rates; (2) lack of\ncorrespondences (i.e., localizing the regions before and after a change); and\n(3) poor zero-shot generalization across different domains. To address these\nissues, we introduce a novel method that leverages change correspondences (a)\nduring training to improve change detection accuracy, and (b) at test time, to\nminimize false positives. That is, we harness the supervision labels of where\nan object is added or removed to supervise change detectors, improving their\naccuracy over previous work by a large margin. Our work is also the first to\npredict correspondences between pairs of detected changes using estimated\nhomography and the Hungarian algorithm. 
Our model demonstrates superior\nperformance over existing methods, achieving state-of-the-art results in change\ndetection and change correspondence accuracy across both in-distribution and\nzero-shot benchmarks.\n","authors":["Hung Huy Nguyen","Pooyan Rahmanzadehgervi","Long Mail","Anh Totti Nguyen"],"pdf_url":"https://arxiv.org/pdf/2501.05555v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.08755v2","updated":"2025-01-09T19:15:20Z","published":"2024-12-11T19:54:14Z","title":"Proactive Adversarial Defense: Harnessing Prompt Tuning in\n Vision-Language Models to Detect Unseen Backdoored Images","summary":" Backdoor attacks pose a critical threat by embedding hidden triggers into\ninputs, causing models to misclassify them into target labels. While extensive\nresearch has focused on mitigating these attacks in object recognition models\nthrough weight fine-tuning, much less attention has been given to detecting\nbackdoored samples directly. Given the vast datasets used in training, manual\ninspection for backdoor triggers is impractical, and even state-of-the-art\ndefense mechanisms fail to fully neutralize their impact. To address this gap,\nwe introduce a groundbreaking method to detect unseen backdoored images during\nboth training and inference. 
Leveraging the transformative success of prompt\ntuning in Vision Language Models (VLMs), our approach trains learnable text\nprompts to differentiate clean images from those with hidden backdoor triggers.\nExperiments demonstrate the exceptional efficacy of this method, achieving an\nimpressive average accuracy of 86% across two renowned datasets for detecting\nunseen backdoor triggers, establishing a new standard in backdoor defense.\n","authors":["Kyle Stein","Andrew Arash Mahyari","Guillermo Francia","Eman El-Sheikh"],"pdf_url":"https://arxiv.org/pdf/2412.08755v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05510v1","updated":"2025-01-09T19:00:01Z","published":"2025-01-09T19:00:01Z","title":"OVO-Bench: How Far is Your Video-LLMs from Real-World Online Video\n Understanding?","summary":" Temporal Awareness, the ability to reason dynamically based on the timestamp\nwhen a question is raised, is the key distinction between offline and online\nvideo LLMs. Unlike offline models, which rely on complete videos for static,\npost hoc analysis, online models process video streams incrementally and\ndynamically adapt their responses based on the timestamp at which the question\nis posed. Despite its significance, temporal awareness has not been adequately\nevaluated in existing benchmarks. To fill this gap, we present OVO-Bench\n(Online-VideO-Benchmark), a novel video benchmark that emphasizes the\nimportance of timestamps for advanced online video understanding capability\nbenchmarking. OVO-Bench evaluates the ability of video LLMs to reason and\nrespond to events occurring at specific timestamps under three distinct\nscenarios: (1) Backward tracing: trace back to past events to answer the\nquestion. (2) Real-time understanding: understand and respond to events as they\nunfold at the current timestamp. (3) Forward active responding: delay the\nresponse until sufficient future information becomes available to answer the\nquestion accurately. 
OVO-Bench comprises 12 tasks, featuring 644 unique videos\nand approximately 2,800 human-curated fine-grained meta-annotations with\nprecise timestamps. We combine automated generation pipelines with human\ncuration. With these high-quality samples, we further developed an evaluation\npipeline to systematically query video LLMs along the video timeline.\nEvaluations of nine Video-LLMs reveal that, despite advancements on traditional\nbenchmarks, current models struggle with online video understanding, showing a\nsignificant gap compared to human agents. We hope OVO-Bench will drive progress\nin video LLMs and inspire future research in online video reasoning. Our\nbenchmark and code can be accessed at https://github.com/JoeLeelyf/OVO-Bench.\n","authors":["Yifei Li","Junbo Niu","Ziyang Miao","Chunjiang Ge","Yuanhang Zhou","Qihao He","Xiaoyi Dong","Haodong Duan","Shuangrui Ding","Rui Qian","Pan Zhang","Yuhang Zang","Yuhang Cao","Conghui He","Jiaqi Wang"],"pdf_url":"https://arxiv.org/pdf/2501.05510v1.pdf","comment":"28 pages"},{"id":"http://arxiv.org/abs/2501.05014v1","updated":"2025-01-09T07:15:59Z","published":"2025-01-09T07:15:59Z","title":"UAV-VLA: Vision-Language-Action System for Large Scale Aerial Mission\n Generation","summary":" The UAV-VLA (Visual-Language-Action) system is a tool designed to facilitate\ncommunication with aerial robots. By integrating satellite imagery processing\nwith the Visual Language Model (VLM) and the powerful capabilities of GPT,\nUAV-VLA enables users to generate general flight path-and-action plans through\nsimple text requests. This system leverages the rich contextual information\nprovided by satellite images, allowing for enhanced decision-making and mission\nplanning. The combination of visual analysis by VLM and natural language\nprocessing by GPT can provide the user with the path-and-action set, making\naerial operations more efficient and accessible.
The newly developed method\nshowed a 22% difference in the length of the created trajectory and a mean\nerror of 34.22 m (Euclidean distance) in finding the objects of interest on a\nmap, using the K-Nearest Neighbors (KNN) approach.\n","authors":["Oleg Sautenkov","Yasheerah Yaqoot","Artem Lykov","Muhammad Ahsan Mustafa","Grik Tadevosyan","Aibek Akhmetkazy","Miguel Altamirano Cabrera","Mikhail Martynov","Sausar Karaf","Dzmitry Tsetserukou"],"pdf_url":"https://arxiv.org/pdf/2501.05014v1.pdf","comment":"HRI 2025"},{"id":"http://arxiv.org/abs/2501.04940v1","updated":"2025-01-09T03:14:03Z","published":"2025-01-09T03:14:03Z","title":"A New Perspective on Privacy Protection in Federated Learning with\n Granular-Ball Computing","summary":" Federated Learning (FL) facilitates collaborative model training while\nprioritizing privacy by avoiding direct data sharing. However, most existing\narticles attempt to address challenges within the model's internal parameters\nand corresponding outputs, while neglecting to solve them at the input level.\nTo address this gap, we propose a novel framework called Granular-Ball\nFederated Learning (GrBFL) for image classification. GrBFL diverges from\ntraditional methods that rely on the finest-grained input data. Instead, it\nsegments images into multiple regions with optimal coarse granularity, which\nare then reconstructed into a graph structure. We designed a two-dimensional\nbinary search segmentation algorithm based on variance constraints for GrBFL,\nwhich effectively removes redundant information while preserving key\nrepresentative features. Extensive theoretical analysis and experiments\ndemonstrate that GrBFL not only safeguards privacy and enhances efficiency but\nalso maintains robust utility, consistently outperforming other\nstate-of-the-art FL methods.
The code is available at\nhttps://github.com/AIGNLAI/GrBFL.\n","authors":["Guannan Lai","Yihui Feng","Xin Yang","Xiaoyu Deng","Hao Yu","Shuyin Xia","Guoyin Wang","Tianrui Li"],"pdf_url":"https://arxiv.org/pdf/2501.04940v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2501.05366v1","updated":"2025-01-09T16:48:17Z","published":"2025-01-09T16:48:17Z","title":"Search-o1: Agentic Search-Enhanced Large Reasoning Models","summary":" Large reasoning models (LRMs) like OpenAI-o1 have demonstrated impressive\nlong stepwise reasoning capabilities through large-scale reinforcement\nlearning. However, their extended reasoning processes often suffer from\nknowledge insufficiency, leading to frequent uncertainties and potential\nerrors. To address this limitation, we introduce \\textbf{Search-o1}, a\nframework that enhances LRMs with an agentic retrieval-augmented generation\n(RAG) mechanism and a Reason-in-Documents module for refining retrieved\ndocuments. Search-o1 integrates an agentic search workflow into the reasoning\nprocess, enabling dynamic retrieval of external knowledge when LRMs encounter\nuncertain knowledge points. Additionally, due to the verbose nature of\nretrieved documents, we design a separate Reason-in-Documents module to deeply\nanalyze the retrieved information before injecting it into the reasoning chain,\nminimizing noise and preserving coherent reasoning flow. Extensive experiments\non complex reasoning tasks in science, mathematics, and coding, as well as six\nopen-domain QA benchmarks, demonstrate the strong performance of Search-o1.\nThis approach enhances the trustworthiness and applicability of LRMs in complex\nreasoning tasks, paving the way for more reliable and versatile intelligent\nsystems. 
The code is available at\n\\url{https://github.com/sunnynexus/Search-o1}.\n","authors":["Xiaoxi Li","Guanting Dong","Jiajie Jin","Yuyao Zhang","Yujia Zhou","Yutao Zhu","Peitian Zhang","Zhicheng Dou"],"pdf_url":"https://arxiv.org/pdf/2501.05366v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05289v1","updated":"2025-01-09T14:51:58Z","published":"2025-01-09T14:51:58Z","title":"Unraveling the Impact of Visual Complexity on Search as Learning","summary":" Information search has become essential for learning and knowledge\nacquisition, offering broad access to information and learning resources. The\nvisual complexity of web pages is known to influence search behavior, with\nprevious work suggesting that searchers make evaluative judgments within the\nfirst second on a page. However, there is a significant gap in our\nunderstanding of how visual complexity impacts searches specifically conducted\nwith a learning intent. This gap is particularly relevant for the development\nof optimized information retrieval (IR) systems that effectively support\neducational objectives. To address this research need, we model visual\ncomplexity and aesthetics via a diverse set of features, investigating their\nrelationship with search behavior during learning-oriented web sessions. Our\nstudy utilizes a publicly available dataset from a lab study where participants\nlearned about thunderstorm formation. Our findings reveal that while content\nrelevance is the most significant predictor for knowledge gain, sessions with\nless visually complex pages are associated with higher learning success. This\nobservation applies to features associated with the layout of web pages rather\nthan to simpler features (e.g., number of images). The reported results shed\nlight on the impact of visual complexity on learning-oriented searches,\ninforming the design of more effective IR systems for educational contexts. 
To\nfoster reproducibility, we release our source code\n(https://github.com/TIBHannover/sal_visual_complexity).\n","authors":["Wolfgang Gritz","Anett Hoppe","Ralph Ewerth"],"pdf_url":"https://arxiv.org/pdf/2501.05289v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05220v1","updated":"2025-01-09T13:13:24Z","published":"2025-01-09T13:13:24Z","title":"A Novel Approach to Scalable and Automatic Topic-Controlled Question\n Generation in Education","summary":" The development of Automatic Question Generation (QG) models has the\npotential to significantly improve educational practices by reducing the\nteacher workload associated with creating educational content. This paper\nintroduces a novel approach to educational question generation that controls\nthe topical focus of questions. The proposed Topic-Controlled Question\nGeneration (T-CQG) method enhances the relevance and effectiveness of the\ngenerated content for educational purposes. Our approach uses fine-tuning on a\npre-trained T5-small model, employing specially created datasets tailored to\neducational needs. The research further explores the impacts of pre-training\nstrategies, quantisation, and data augmentation on the model's performance. We\nspecifically address the challenge of generating semantically aligned questions\nwith paragraph-level contexts, thereby improving the topic specificity of the\ngenerated questions. In addition, we introduce and explore novel evaluation\nmethods to assess the topical relatedness of the generated questions. Our\nresults, validated through rigorous offline and human-backed evaluations,\ndemonstrate that the proposed models effectively generate high-quality,\ntopic-focused questions. These models have the potential to reduce teacher\nworkload and support personalised tutoring systems by serving as bespoke\nquestion generators. 
With their relatively small number of parameters, the\nproposed models not only advance the capabilities of question generation models\nfor handling specific educational topics but also offer a scalable solution\nthat reduces infrastructure costs. This scalability makes them feasible for\nwidespread use in education without reliance on proprietary large language\nmodels like ChatGPT.\n","authors":["Ziqing Li","Mutlu Cukurova","Sahan Bulathwela"],"pdf_url":"https://arxiv.org/pdf/2501.05220v1.pdf","comment":"To be published at ACM Conf. on Learning Analytics and Knowledge\n (LAK'25)"},{"id":"http://arxiv.org/abs/2501.05170v1","updated":"2025-01-09T11:44:49Z","published":"2025-01-09T11:44:49Z","title":"De-centering the (Traditional) User: Multistakeholder Evaluation of\n Recommender Systems","summary":" Multistakeholder recommender systems are those that account for the impacts\nand preferences of multiple groups of individuals, not just the end users\nreceiving recommendations. Due to their complexity, evaluating these systems\ncannot be restricted to the overall utility of a single stakeholder, as is\noften the case for more mainstream recommender system applications. In this\narticle, we focus our discussion on the intricacies of the evaluation of\nmultistakeholder recommender systems. We bring attention to the different\naspects involved in the evaluation of multistakeholder recommender systems -\nfrom the range of stakeholders involved (including but not limited to producers\nand consumers) to the values and specific goals of each relevant stakeholder.\nAdditionally, we discuss how to move from theoretical principles to practical\nimplementation, providing specific use case examples. Finally, we outline open\nresearch directions for the RecSys community to explore.
We aim to provide\nguidance to researchers and practitioners about how to think about these\ncomplex and domain-dependent issues of evaluation in the course of designing,\ndeveloping, and researching applications with multistakeholder aspects.\n","authors":["Robin Burke","Gediminas Adomavicius","Toine Bogers","Tommaso Di Noia","Dominik Kowald","Julia Neidhardt","Özlem Özgöbek","Maria Soledad Pera","Nava Tintarev","Jürgen Ziegler"],"pdf_url":"https://arxiv.org/pdf/2501.05170v1.pdf","comment":"Preprint submitted to Elsevier, \"Re-centering the User in Recommender\n System Research\" special issue of the International Journal of Human-Computer\n Studies (IJHCS)"},{"id":"http://arxiv.org/abs/2501.05082v1","updated":"2025-01-09T09:03:43Z","published":"2025-01-09T09:03:43Z","title":"Comparison of Feature Learning Methods for Metadata Extraction from PDF\n Scholarly Documents","summary":" The availability of metadata for scientific documents is pivotal in\npropelling scientific knowledge forward and for adhering to the FAIR principles\n(i.e. Findability, Accessibility, Interoperability, and Reusability) of\nresearch findings. However, the lack of sufficient metadata in published\ndocuments, particularly those from smaller and mid-sized publishers, hinders\ntheir accessibility. This issue is widespread in some disciplines, such as the\nGerman Social Sciences, where publications often employ diverse templates. To\naddress this challenge, our study evaluates various feature learning and\nprediction methods, including natural language processing (NLP), computer\nvision (CV), and multimodal approaches, for extracting metadata from documents\nwith high template variance. We aim to improve the accessibility of scientific\ndocuments and facilitate their wider use. To support our comparison of these\nmethods, we provide comprehensive experimental results, analyzing their\naccuracy and efficiency in extracting metadata. 
Additionally, we provide\nvaluable insights into the strengths and weaknesses of various feature learning\nand prediction methods, which can guide future research in this field.\n","authors":["Zeyd Boukhers","Cong Yang"],"pdf_url":"https://arxiv.org/pdf/2501.05082v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05072v1","updated":"2025-01-09T08:54:19Z","published":"2025-01-09T08:54:19Z","title":"A Flexible and Scalable Framework for Video Moment Search","summary":" Video moment search, the process of finding relevant moments in a video\ncorpus to match a user's query, is crucial for various applications. Existing\nsolutions, however, often assume a single perfect matching moment, struggle\nwith inefficient inference, and have limitations with hour-long videos. This\npaper introduces a flexible and scalable framework for retrieving a ranked list\nof moments from a collection of videos of any length to match a text query, a\ntask termed Ranked Video Moment Retrieval (RVMR). Our framework, called\nSegment-Proposal-Ranking (SPR), simplifies the search process into three\nindependent stages: segment retrieval, proposal generation, and moment\nrefinement with re-ranking. Specifically, videos are divided into equal-length\nsegments with precomputed embeddings indexed offline, allowing efficient\nretrieval regardless of video length. For scalable online retrieval, both\nsegments and queries are projected into a shared feature space to enable\napproximate nearest neighbor (ANN) search. Retrieved segments are then merged\ninto coarse-grained moment proposals. A refinement and re-ranking module then\nreorders and adjusts the timestamps of the coarse-grained proposals.\nEvaluations on the TVR-Ranking dataset demonstrate that our framework achieves\nstate-of-the-art performance with significant reductions in computational cost\nand processing time.
The flexible design also allows for independent\nimprovements to each stage, making SPR highly adaptable for large-scale\napplications.\n","authors":["Chongzhi Zhang","Xizhou Zhu","Aixin Sun"],"pdf_url":"https://arxiv.org/pdf/2501.05072v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05018v1","updated":"2025-01-09T07:21:44Z","published":"2025-01-09T07:21:44Z","title":"Finding Needles in Emb(a)dding Haystacks: Legal Document Retrieval via\n Bagging and SVR Ensembles","summary":" We introduce a retrieval approach leveraging Support Vector Regression (SVR)\nensembles, bootstrap aggregation (bagging), and embedding spaces on the German\nDataset for Legal Information Retrieval (GerDaLIR). By conceptualizing the\nretrieval task in terms of multiple binary needle-in-a-haystack subtasks, we\nshow improved recall over the baselines (0.849 > 0.803 | 0.829) using our\nvoting ensemble, suggesting promising initial results, without training or\nfine-tuning any deep learning models. Our approach holds potential for further\nenhancement, particularly through refining the encoding models and optimizing\nhyperparameters.\n","authors":["Kevin Bönisch","Alexander Mehler"],"pdf_url":"https://arxiv.org/pdf/2501.05018v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08981v3","updated":"2025-01-09T00:53:45Z","published":"2024-08-16T19:10:48Z","title":"From Lazy to Prolific: Tackling Missing Labels in Open Vocabulary\n Extreme Classification by Positive-Unlabeled Sequence Learning","summary":" Open-vocabulary Extreme Multi-label Classification (OXMC) extends traditional\nXMC by allowing prediction beyond an extremely large, predefined label set\n(typically $10^3$ to $10^{12}$ labels), addressing the dynamic nature of\nreal-world labeling tasks. However, self-selection bias in data annotation\nleads to significant missing labels in both training and test data,\nparticularly for less popular inputs. 
This creates two critical challenges:\ngeneration models learn to be \"lazy\" by under-generating labels, and\nevaluation becomes unreliable due to insufficient annotation in the test set.\nIn this work, we introduce Positive-Unlabeled Sequence Learning (PUSL), which\nreframes OXMC as an infinite keyphrase generation task, addressing the\ngeneration model's laziness. Additionally, we propose to adopt a suite of\nevaluation metrics, F1@$\mathcal{O}$ and the newly proposed B@$k$, to reliably\nassess OXMC models with incomplete ground truths. On a highly imbalanced\ne-commerce dataset with substantial missing labels, PUSL generates 30% more\nunique labels, and 72% of its predictions align with actual user queries. On\nthe less skewed EURLex-4.3k dataset, PUSL demonstrates superior F1 scores,\nespecially as label counts increase from 15 to 30. Our approach effectively\ntackles both the modeling and evaluation challenges in OXMC with missing\nlabels.\n","authors":["Ranran Haoran Zhang","Bensu Uçar","Soumik Dey","Hansi Wu","Binbin Li","Rui Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.08981v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05606v1","updated":"2025-01-09T22:48:43Z","published":"2025-01-09T22:48:43Z","title":"Harmonizing Metadata of Language Resources for Enhanced Querying and\n Accessibility","summary":" This paper addresses the harmonization of metadata from diverse repositories\nof language resources (LRs). Leveraging linked data and RDF techniques, we\nintegrate data from multiple sources into a unified model based on DCAT and\nthe META-SHARE OWL ontology. Our methodology supports text-based search, faceted\nbrowsing, and advanced SPARQL queries through Linghub, a newly developed\nportal. Real user queries from the Corpora Mailing List (CML) were evaluated to\nassess Linghub's capability to satisfy actual user needs. Results indicate that\nwhile some limitations persist, many user requests can be successfully\naddressed.
The study highlights significant metadata issues and advocates for\nadherence to open vocabularies and standards to enhance metadata harmonization.\nThis initial research underscores the importance of API-based access to LRs,\npromoting machine usability and data subset extraction for specific purposes,\npaving the way for more efficient and standardized LR utilization.\n","authors":["Zixuan Liang"],"pdf_url":"https://arxiv.org/pdf/2501.05606v1.pdf","comment":"2024 5th International Conference on Computers and Artificial\n Intelligence Technology (CAIT 2024)"},{"id":"http://arxiv.org/abs/2404.09889v3","updated":"2025-01-09T22:43:05Z","published":"2024-04-15T15:55:01Z","title":"Is Table Retrieval a Solved Problem? Exploring Join-Aware Multi-Table\n Retrieval","summary":" Retrieving relevant tables containing the necessary information to accurately\nanswer a given question over tables is critical to open-domain\nquestion-answering (QA) systems. Previous methods assume the answer to such a\nquestion can be found either in a single table or multiple tables identified\nthrough question decomposition or rewriting. However, neither of these\napproaches is sufficient, as many questions require retrieving multiple tables\nand joining them through a join plan that cannot be discerned from the user\nquery itself. If the join plan is not considered in the retrieval stage, the\nsubsequent steps of reasoning and answering based on those retrieved tables are\nlikely to be incorrect. To address this problem, we introduce a method that\nuncovers useful join relations for any query and database during table\nretrieval. We use a novel re-ranking method formulated as a mixed-integer\nprogram that considers not only table-query relevance but also table-table\nrelevance that requires inferring join relationships. 
Our method outperforms\nthe state-of-the-art approaches for table retrieval by up to 9.3% in F1 score\nand for end-to-end QA by up to 5.4% in accuracy.\n","authors":["Peter Baile Chen","Yi Zhang","Dan Roth"],"pdf_url":"https://arxiv.org/pdf/2404.09889v3.pdf","comment":"ACL 2024. Dataset and code are available at\n https://peterbaile.github.io/jar"},{"id":"http://arxiv.org/abs/2405.17428v2","updated":"2025-01-09T22:27:06Z","published":"2024-05-27T17:59:45Z","title":"NV-Embed: Improved Techniques for Training LLMs as Generalist Embedding\n Models","summary":" Decoder-only large language model (LLM)-based embedding models are beginning\nto outperform BERT or T5-based embedding models in general-purpose text\nembedding tasks, including dense vector-based retrieval. In this work, we\nintroduce the NV-Embed model, incorporating architectural designs, training\nprocedures, and curated datasets to significantly enhance the performance of\nthe LLM as a versatile embedding model, while maintaining its simplicity and\nreproducibility. For the model architecture, we propose a latent attention\nlayer to obtain pooled embeddings, which consistently improves retrieval and\ndownstream task accuracy compared to mean pooling or using the last token\nembedding from LLMs. To enhance representation learning, we remove the causal\nattention mask of LLMs during contrastive training. For the training algorithm,\nwe introduce a two-stage contrastive instruction-tuning method. It first\napplies contrastive training with instructions on retrieval datasets, utilizing\nin-batch negatives and curated hard negative examples. At stage two, it blends\nvarious non-retrieval datasets into instruction tuning, which not only enhances\nnon-retrieval task accuracy but also improves retrieval performance. For\ntraining data, we utilize hard-negative mining, synthetic data generation, and\nexisting publicly available datasets to boost the performance of the embedding\nmodel.
By combining these\ntechniques, our NV-Embed-v1 and NV-Embed-v2 models obtained the No.1 position\non the Massive Text Embedding Benchmark (MTEB) (as of May 24, 2024 and August\n30, 2024, respectively) across 56 embedding tasks, demonstrating the sustained\neffectiveness of the proposed methods over time. Additionally, they achieved\nthe highest scores in the Long Doc section and the second-highest scores in the\nQA section of the AIR Benchmark, which covers a range of out-of-domain\ninformation retrieval topics beyond those in MTEB.\n","authors":["Chankyu Lee","Rajarshi Roy","Mengyao Xu","Jonathan Raiman","Mohammad Shoeybi","Bryan Catanzaro","Wei Ping"],"pdf_url":"https://arxiv.org/pdf/2405.17428v2.pdf","comment":"We open-source the model at:\n https://huggingface.co/nvidia/NV-Embed-v2"},{"id":"http://arxiv.org/abs/2501.05497v1","updated":"2025-01-09T17:20:00Z","published":"2025-01-09T17:20:00Z","title":"Spatial Information Integration in Small Language Models for Document\n Layout Generation and Classification","summary":" Document layout understanding is a field of study that analyzes the spatial\narrangement of information in a document, hoping to understand its structure\nand layout. Models such as LayoutLM (and its subsequent iterations) can\nunderstand semi-structured documents with SotA results; however, the lack of\nopen semi-structured data is a limitation in itself. While semi-structured data\nis common in everyday life (balance sheets, purchase orders, receipts), there\nis a lack of public datasets for training machine learning models for this type\nof document. In this investigation we propose a method to generate new,\nsynthetic layout information that can help overcome this data shortage.\nAccording to our results, the proposed method performs better than\nLayoutTransformer, another popular layout generation method.
We also show that, in some scenarios,\ntext classification can improve when supported by bounding box information.\n","authors":["Pablo Melendez","Clemens Havas"],"pdf_url":"https://arxiv.org/pdf/2501.05497v1.pdf","comment":"8 pages. Symposium on Applied Computing 2025"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2501.05450v1","updated":"2025-01-09T18:59:56Z","published":"2025-01-09T18:59:56Z","title":"Decentralized Diffusion Models","summary":" Large-scale AI model training divides work across thousands of GPUs, then\nsynchronizes gradients across them at each step. This incurs a significant\nnetwork burden that only centralized, monolithic clusters can support, driving\nup infrastructure costs and straining power systems. We propose Decentralized\nDiffusion Models, a scalable framework for distributing diffusion model\ntraining across independent clusters or datacenters by eliminating the\ndependence on a centralized, high-bandwidth networking fabric. Our method\ntrains a set of expert diffusion models over partitions of the dataset, each in\nfull isolation from one another. At inference time, the experts ensemble\nthrough a lightweight router. We show that the ensemble collectively optimizes\nthe same objective as a single model trained over the whole dataset. This means\nwe can divide the training burden among a number of \"compute islands,\" lowering\ninfrastructure costs and improving resilience to localized GPU failures.\nDecentralized diffusion models empower researchers to take advantage of\nsmaller, more cost-effective and more readily available compute like on-demand\nGPU nodes rather than central integrated systems. We conduct extensive\nexperiments on ImageNet and LAION Aesthetics, showing that decentralized\ndiffusion models FLOP-for-FLOP outperform standard diffusion models. 
We finally\nscale our approach to 24 billion parameters, demonstrating that high-quality\ndiffusion models can now be trained with just eight individual GPU nodes in\nless than a week.\n","authors":["David McAllister","Matthew Tancik","Jiaming Song","Angjoo Kanazawa"],"pdf_url":"https://arxiv.org/pdf/2501.05450v1.pdf","comment":"Project webpage: https://decentralizeddiffusion.github.io/"},{"id":"http://arxiv.org/abs/2501.05445v1","updated":"2025-01-09T18:56:05Z","published":"2025-01-09T18:56:05Z","title":"Consistent Flow Distillation for Text-to-3D Generation","summary":" Score Distillation Sampling (SDS) has made significant strides in distilling\nimage-generative models for 3D generation. However, its\nmaximum-likelihood-seeking behavior often leads to degraded visual quality and\ndiversity, limiting its effectiveness in 3D applications. In this work, we\npropose Consistent Flow Distillation (CFD), which addresses these limitations.\nWe begin by leveraging the gradient of the diffusion ODE or SDE sampling\nprocess to guide the 3D generation. From the gradient-based sampling\nperspective, we find that the consistency of 2D image flows across different\nviewpoints is important for high-quality 3D generation. To achieve this, we\nintroduce multi-view consistent Gaussian noise on the 3D object, which can be\nrendered from various viewpoints to compute the flow gradient. Our experiments\ndemonstrate that CFD, through consistent flows, significantly outperforms\nprevious methods in text-to-3D generation.\n","authors":["Runjie Yan","Yinbo Chen","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2501.05445v1.pdf","comment":"Project page: https://runjie-yan.github.io/cfd/"},{"id":"http://arxiv.org/abs/2501.05441v1","updated":"2025-01-09T18:53:06Z","published":"2025-01-09T18:53:06Z","title":"The GAN is dead; long live the GAN! 
A Modern GAN Baseline","summary":" There is a widely-spread claim that GANs are difficult to train, and GAN\narchitectures in the literature are littered with empirical tricks. We provide\nevidence against this claim and build a modern GAN baseline in a more\nprincipled manner. First, we derive a well-behaved regularized relativistic GAN\nloss that addresses issues of mode dropping and non-convergence that were\npreviously tackled via a bag of ad-hoc tricks. We analyze our loss\nmathematically and prove that it admits local convergence guarantees, unlike\nmost existing relativistic losses. Second, our new loss allows us to discard\nall ad-hoc tricks and replace outdated backbones used in common GANs with\nmodern architectures. Using StyleGAN2 as an example, we present a roadmap of\nsimplification and modernization that results in a new minimalist baseline --\nR3GAN. Despite being simple, our approach surpasses StyleGAN2 on FFHQ,\nImageNet, CIFAR, and Stacked MNIST datasets, and compares favorably against\nstate-of-the-art GANs and diffusion models.\n","authors":["Yiwen Huang","Aaron Gokaslan","Volodymyr Kuleshov","James Tompkin"],"pdf_url":"https://arxiv.org/pdf/2501.05441v1.pdf","comment":"Accepted to NeurIPS 2024. Code available at\n https://github.com/brownvc/R3GAN/"},{"id":"http://arxiv.org/abs/2501.05439v1","updated":"2025-01-09T18:49:39Z","published":"2025-01-09T18:49:39Z","title":"From Simple to Complex Skills: The Case of In-Hand Object Reorientation","summary":" Learning policies in simulation and transferring them to the real world has\nbecome a promising approach in dexterous manipulation. However, bridging the\nsim-to-real gap for each new task requires substantial human effort, such as\ncareful reward engineering, hyperparameter tuning, and system identification.\nIn this work, we present a system that leverages low-level skills to address\nthese challenges for more complex tasks. 
Specifically, we introduce a\nhierarchical policy for in-hand object reorientation based on previously\nacquired rotation skills. This hierarchical policy learns to select which\nlow-level skill to execute based on feedback from both the environment and the\nlow-level skill policies themselves. Compared to learning from scratch, the\nhierarchical policy is more robust to out-of-distribution changes and transfers\neasily from simulation to real-world environments. Additionally, we propose a\ngeneralizable object pose estimator that uses proprioceptive information,\nlow-level skill predictions, and control errors as inputs to estimate the\nobject pose over time. We demonstrate that our system can reorient objects,\nincluding symmetrical and textureless ones, to a desired pose.\n","authors":["Haozhi Qi","Brent Yi","Mike Lambeta","Yi Ma","Roberto Calandra","Jitendra Malik"],"pdf_url":"https://arxiv.org/pdf/2501.05439v1.pdf","comment":"website: https://dexhier.github.io"},{"id":"http://arxiv.org/abs/2412.11526v3","updated":"2025-01-09T18:44:52Z","published":"2024-12-16T08:01:22Z","title":"Probabilities-Informed Machine Learning","summary":" Machine learning (ML) has emerged as a powerful tool for tackling complex\nregression and classification tasks, yet its success often hinges on the\nquality of training data. This study introduces an ML paradigm inspired by\ndomain knowledge of the structure of output function, akin to physics-informed\nML, but rooted in probabilistic principles rather than physical laws. The\nproposed approach integrates the probabilistic structure of the target variable\n(such as its cumulative distribution function) into the training process. This\nprobabilistic information is obtained from historical data or estimated using\nstructural reliability methods during experimental design. 
By embedding\ndomain-specific probabilistic insights into the learning process, the technique\nenhances model accuracy and mitigates risks of overfitting and underfitting.\nApplications in regression, image denoising, and classification demonstrate the\napproach's effectiveness in addressing real-world problems.\n","authors":["Mohsen Rashki"],"pdf_url":"https://arxiv.org/pdf/2412.11526v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05425v1","updated":"2025-01-09T18:31:35Z","published":"2025-01-09T18:31:35Z","title":"Entangled Mean Estimation in High-Dimensions","summary":" We study the task of high-dimensional entangled mean estimation in the\nsubset-of-signals model. Specifically, given $N$ independent random points\n$x_1,\\ldots,x_N$ in $\\mathbb{R}^D$ and a parameter $\\alpha \\in (0, 1)$ such\nthat each $x_i$ is drawn from a Gaussian with mean $\\mu$ and unknown\ncovariance, and an unknown $\\alpha$-fraction of the points have\nidentity-bounded covariances, the goal is to estimate the common mean $\\mu$.\nThe one-dimensional version of this task has received significant attention in\ntheoretical computer science and statistics over the past decades. Recent work\n[LY20; CV24] has given near-optimal upper and lower bounds for the\none-dimensional setting. On the other hand, our understanding of even the\ninformation-theoretic aspects of the multivariate setting has remained limited.\n In this work, we design a computationally efficient algorithm achieving an\ninformation-theoretically near-optimal error. Specifically, we show that the\noptimal error (up to polylogarithmic factors) is $f(\\alpha,N) + \\sqrt{D/(\\alpha\nN)}$, where the term $f(\\alpha,N)$ is the error of the one-dimensional problem\nand the second term is the sub-Gaussian error rate. Our algorithmic approach\nemploys an iterative refinement strategy, whereby we progressively learn more\naccurate approximations $\\hat \\mu$ to $\\mu$. 
This is achieved via a novel\nrejection sampling procedure that removes points significantly deviating from\n$\\hat \\mu$, as an attempt to filter out unusually noisy samples. A complication\nthat arises is that rejection sampling introduces bias in the distribution of\nthe remaining points. To address this issue, we perform a careful analysis of\nthe bias, develop an iterative dimension-reduction strategy, and employ a novel\nsubroutine inspired by list-decodable learning that leverages the\none-dimensional result.\n","authors":["Ilias Diakonikolas","Daniel M. Kane","Sihan Liu","Thanasis Pittas"],"pdf_url":"https://arxiv.org/pdf/2501.05425v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05423v1","updated":"2025-01-09T18:30:14Z","published":"2025-01-09T18:30:14Z","title":"Using LLMs to Infer Non-Binary COVID-19 Sentiments of Chinese\n Micro-bloggers","summary":" Studying public sentiment during crises is crucial for understanding how\nopinions and sentiments shift, resulting in polarized societies. We study\nWeibo, the most popular microblogging site in China, using posts made during\nthe outbreak of the COVID-19 crisis. The study period includes the pre-COVID-19\nstage, the outbreak stage, and the early stage of epidemic prevention. We use\nLlama 3 8B, a Large Language Model, to analyze users' sentiments on the\nplatform by classifying them into positive, negative, sarcastic, and neutral\ncategories. Analyzing sentiment shifts on Weibo provides insights into how\nsocial events and government actions influence public opinion. This study\ncontributes to understanding the dynamics of social sentiments during health\ncrises, fulfilling a gap in sentiment analysis for Chinese platforms. By\nexamining these dynamics, we aim to offer valuable perspectives on digital\ncommunication's role in shaping society's responses during unprecedented global\nchallenges.\n","authors":["Jerry Chongyi Hu","Mohammed Shahid Modi","Boleslaw K. 
Szymanski"],"pdf_url":"https://arxiv.org/pdf/2501.05423v1.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2501.05415v1","updated":"2025-01-09T18:17:27Z","published":"2025-01-09T18:17:27Z","title":"Uncertainty-aware Knowledge Tracing","summary":" Knowledge Tracing (KT) is crucial in education assessment, which focuses on\ndepicting students' learning states and assessing students' mastery of\nsubjects. With the rise of modern online learning platforms, particularly\nmassive open online courses (MOOCs), an abundance of interaction data has\ngreatly advanced the development of the KT technology. Previous research\ncommonly adopts deterministic representation to capture students' knowledge\nstates, which neglects the uncertainty during student interactions and thus\nfails to model the true knowledge state in learning process. In light of this,\nwe propose an Uncertainty-Aware Knowledge Tracing model (UKT) which employs\nstochastic distribution embeddings to represent the uncertainty in student\ninteractions, with a Wasserstein self-attention mechanism designed to capture\nthe transition of state distribution in student learning behaviors.\nAdditionally, we introduce the aleatory uncertainty-aware contrastive learning\nloss, which strengthens the model's robustness towards different types of\nuncertainties. 
Extensive experiments on six real-world datasets demonstrate\nthat UKT not only significantly surpasses existing deep learning-based models\nin KT prediction, but also shows unique advantages in handling the uncertainty\nof student interactions.\n","authors":["Weihua Cheng","Hanwen Du","Chunxiao Li","Ersheng Ni","Liangdi Tan","Tianqi Xu","Yongxin Ni"],"pdf_url":"https://arxiv.org/pdf/2501.05415v1.pdf","comment":"Accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2412.18234v2","updated":"2025-01-09T18:16:38Z","published":"2024-12-24T07:35:48Z","title":"Conditional Deep Canonical Time Warping","summary":" Temporal alignment of sequences is a fundamental challenge in many\napplications, such as computer vision and bioinformatics, where local time\nshifting needs to be accounted for. Misalignment can lead to poor model\ngeneralization, especially in high-dimensional sequences. Existing methods\noften struggle with optimization when dealing with high-dimensional sparse\ndata, falling into poor alignments. Feature selection is frequently used to\nenhance model performance for sparse data. However, a fixed set of selected\nfeatures would not generally work for dynamically changing sequences and would\nneed to be modified based on the state of the sequence. Therefore, modifying\nthe selected features based on contextual input would result in better\nalignment. Our suggested method, Conditional Deep Canonical Time Warping\n(CDCTW), is designed for temporal alignment in sparse temporal data to\naddress these challenges. CDCTW enhances alignment accuracy for\nhigh-dimensional time-dependent views by performing dynamic time warping on\ndata embedded in a maximally correlated subspace, which handles sparsity with a\nnovel feature selection method. 
We validate the effectiveness of CDCTW through\nextensive experiments on various datasets, demonstrating superior performance\nover previous techniques.\n","authors":["Afek Steinberg","Ran Eisenberg","Ofir Lindenbaum"],"pdf_url":"https://arxiv.org/pdf/2412.18234v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05409v1","updated":"2025-01-09T18:06:45Z","published":"2025-01-09T18:06:45Z","title":"A Novel Pathology Foundation Model by Mayo Clinic, Charité, and\n Aignostics","summary":" Recent advances in digital pathology have demonstrated the effectiveness of\nfoundation models across diverse applications. In this report, we present a\nnovel vision foundation model based on the RudolfV approach. Our model was\ntrained on a dataset comprising 1.2 million histopathology whole slide images,\ncollected from two medical institutions: Mayo Clinic and Charit\\'e -\nUniverst\\\"atsmedizin Berlin. Comprehensive evaluations show that our model\nachieves state-of-the-art performance across twenty-one public benchmark\ndatasets, even though it is neither the largest model by parameter count nor by\ntraining dataset size.\n","authors":["Maximilian Alber","Stephan Tietz","Jonas Dippel","Timo Milbich","Timothée Lesort","Panos Korfiatis","Moritz Krügener","Beatriz Perez Cancer","Neelay Shah","Alexander Möllers","Philipp Seegerer","Alexandra Carpen-Amarie","Kai Standvoss","Gabriel Dernbach","Edwin de Jong","Simon Schallenberg","Andreas Kunft","Helmut Hoffer von Ankershoffen","Gavin Schaeferle","Patrick Duffy","Matt Redlon","Philipp Jurmeister","David Horst","Lukas Ruff","Klaus-Robert Müller","Frederick Klauschen","Andrew Norgan"],"pdf_url":"https://arxiv.org/pdf/2501.05409v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05408v1","updated":"2025-01-09T18:05:33Z","published":"2025-01-09T18:05:33Z","title":"TimeRL: Efficient Deep Reinforcement Learning with Polyhedral Dependence\n Graphs","summary":" Modern deep learning (DL) workloads increasingly use complex 
deep\nreinforcement learning (DRL) algorithms that generate training data within the\nlearning loop. This results in programs with several nested loops and dynamic\ndata dependencies between tensors. While DL systems with eager execution\nsupport such dynamism, they lack the optimizations and smart scheduling of\ngraph-based execution. Graph-based execution, however, cannot express dynamic\ntensor shapes, instead requiring the use of multiple static subgraphs. Either\nexecution model for DRL thus leads to redundant computation, reduced\nparallelism, and less efficient memory management.\n We describe TimeRL, a system for executing dynamic DRL programs that combines\nthe dynamism of eager execution with the whole-program optimizations and\nscheduling of graph-based execution. TimeRL achieves this by introducing the\ndeclarative programming model of recurrent tensors, which allows users to\ndefine dynamic dependencies as intuitive recurrence equations. TimeRL\ntranslates recurrent tensors into a polyhedral dependence graph (PDG) with\ndynamic dependencies as symbolic expressions. Through simple PDG\ntransformations, TimeRL applies whole-program optimizations, such as automatic\nvectorization, incrementalization, and operator fusion. The PDG also allows for\nthe computation of an efficient program-wide execution schedule, which decides\non buffer deallocations, buffer donations, and GPU/CPU memory swapping. We show\nthat TimeRL executes current DRL algorithms up to 47$\\times$ faster than\nexisting DRL systems, while using 16$\\times$ less GPU peak memory.\n","authors":["Pedro F. 
Silvestre","Peter Pietzuch"],"pdf_url":"https://arxiv.org/pdf/2501.05408v1.pdf","comment":"17 pages, 11 figures, 5 bibliography pages"},{"id":"http://arxiv.org/abs/2501.05407v1","updated":"2025-01-09T18:05:05Z","published":"2025-01-09T18:05:05Z","title":"On-line Policy Improvement using Monte-Carlo Search","summary":" We present a Monte-Carlo simulation algorithm for real-time policy\nimprovement of an adaptive controller. In the Monte-Carlo simulation, the\nlong-term expected reward of each possible action is statistically measured,\nusing the initial policy to make decisions in each step of the simulation. The\naction maximizing the measured expected reward is then taken, resulting in an\nimproved policy. Our algorithm is easily parallelizable and has been\nimplemented on the IBM SP1 and SP2 parallel-RISC supercomputers.\n We have obtained promising initial results in applying this algorithm to the\ndomain of backgammon. Results are reported for a wide variety of initial\npolicies, ranging from a random policy to TD-Gammon, an extremely strong\nmulti-layer neural network. In each case, the Monte-Carlo algorithm gives a\nsubstantial reduction, by as much as a factor of 5 or more, in the error rate\nof the base players. The algorithm is also potentially useful in many other\nadaptive control applications in which it is possible to simulate the\nenvironment.\n","authors":["Gerald Tesauro","Gregory R. 
Galperin"],"pdf_url":"https://arxiv.org/pdf/2501.05407v1.pdf","comment":"Accompanied by oral presentation by Gregory Galperin at NeurIPS 1996\n (then known as NIPS*96)"},{"id":"http://arxiv.org/abs/2405.13536v2","updated":"2025-01-09T17:58:44Z","published":"2024-05-22T11:14:00Z","title":"Attention Mechanisms Don't Learn Additive Models: Rethinking Feature\n Importance for Transformers","summary":" We address the critical challenge of applying feature attribution methods to\nthe transformer architecture, which dominates current applications in natural\nlanguage processing and beyond. Traditional attribution methods in explainable\nAI (XAI) explicitly or implicitly rely on linear or additive surrogate models\nto quantify the impact of input features on a model's output. In this work, we\nformally prove an alarming incompatibility: transformers are structurally\nincapable of representing the linear or additive surrogate models used for feature\nattribution, undermining the grounding of these conventional explanation\nmethodologies. To address this discrepancy, we introduce the Softmax-Linked\nAdditive Log Odds Model (SLALOM), a novel surrogate model specifically designed\nto align with the transformer framework. SLALOM demonstrates the capacity to\ndeliver a range of insightful explanations with both synthetic and real-world\ndatasets. We highlight SLALOM's unique efficiency-quality curve by showing that\nSLALOM can produce explanations with substantially higher fidelity than\ncompeting surrogate models or provide explanations of comparable quality at a\nfraction of their computational costs. 
We release code for SLALOM as an\nopen-source project online at https://github.com/tleemann/slalom_explanations.\n","authors":["Tobias Leemann","Alina Fastowski","Felix Pfeiffer","Gjergji Kasneci"],"pdf_url":"https://arxiv.org/pdf/2405.13536v2.pdf","comment":"TMLR Camera-Ready version"},{"id":"http://arxiv.org/abs/2501.05403v1","updated":"2025-01-09T17:57:56Z","published":"2025-01-09T17:57:56Z","title":"TimeDP: Learning to Generate Multi-Domain Time Series with Domain\n Prompts","summary":" Time series generation models are crucial for applications like data\naugmentation and privacy preservation. Existing time series generation\nmodels are typically designed to generate data from one specified domain. While\nleveraging data from other domains for better generalization has proven to work\nin other application areas, this approach remains challenging for time series\nmodeling due to the large divergence in patterns among different real-world\ntime series categories. In this paper, we propose a multi-domain time series\ndiffusion model with domain prompts, named TimeDP. In TimeDP, we utilize a time\nseries semantic prototype module which defines time series prototypes to\nrepresent a time series basis, each prototype vector serving as a \"word\"\nrepresenting some elementary time series feature. A prototype assignment module\nis applied to extract the domain-specific prototype weights for\nlearning domain prompts as the generation condition. During sampling, we\nextract the \"domain prompt\" from few-shot samples of the target domain and use\nthe domain prompts as the condition to generate time series samples. 
Experiments demonstrate\nthat our method outperforms baselines to provide the state-of-the-art in-domain\ngeneration quality and strong unseen domain generation capability.\n","authors":["Yu-Hao Huang","Chang Xu","Yueying Wu","Wu-Jun Li","Jiang Bian"],"pdf_url":"https://arxiv.org/pdf/2501.05403v1.pdf","comment":"AAAI 2025"},{"id":"http://arxiv.org/abs/2408.01857v2","updated":"2025-01-09T17:54:15Z","published":"2024-08-03T20:00:36Z","title":"Using Linearized Optimal Transport to Predict the Evolution of\n Stochastic Particle Systems","summary":" We develop an algorithm to approximate the time evolution of a probability\ndistribution without explicitly learning an operator that governs the\nevolution. A particular application of interest is discrete measures $\\mu_t^N$\nthat arise from systems of $N$ particles in $\\mathbb R^d$. In many such\nsituations, the individual particles move chaotically on short time scales,\nmaking it difficult to learn the dynamics of a governing operator, but the bulk\ndistribution $\\mu_t^N$ approximates an absolutely continuous measure $\\mu_t$\nthat evolves ``smoothly.'' If $\\mu_t$ is known on some time interval, then\nlinearized optimal transport theory provides an Euler-like scheme for\napproximating the evolution of $\\mu_t$ using its ``tangent vector field''\n(represented as a time-dependent vector field on $\\mathbb R^d$), which can be\ncomputed as a limit of optimal transport maps. We propose an analog of this\nEuler approximation to predict the evolution of the discrete measure $\\mu_t^N$\n(without knowing $\\mu_t$). To approximate the analogous tangent vector field,\nwe use a finite difference over a time step that sits between two time scales\nof the system -- long enough for a large-$N$ evolution ($\\mu_t$) to emerge but\nshort enough to satisfactorily approximate the derivative object used in the\nEuler scheme. 
The emergence of the limiting behavior ensures the optimal\ntransport maps closely approximate the vector field describing the bulk\ndistribution's smooth evolution instead of the individual particles' more\nchaotic movements. We demonstrate the efficacy of our approach with two\nillustrative examples, Gaussian diffusion and a cell chemotaxis model, and show\nthat our method succeeds in predicting the bulk behavior over relatively large\nsteps.\n","authors":["Nicholas Karris","Evangelos A. Nikitopoulos","Ioannis Kevrekidis","Seungjoon Lee","Alexander Cloninger"],"pdf_url":"https://arxiv.org/pdf/2408.01857v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05401v1","updated":"2025-01-09T17:50:56Z","published":"2025-01-09T17:50:56Z","title":"BRATI: Bidirectional Recurrent Attention for Time-Series Imputation","summary":" Missing data in time-series analysis poses significant challenges, affecting\nthe reliability of downstream applications. Imputation, the process of\nestimating missing values, has emerged as a key solution. This paper introduces\nBRATI, a novel deep-learning model designed to address multivariate time-series\nimputation by combining Bidirectional Recurrent Networks and Attention\nmechanisms. BRATI processes temporal dependencies and feature correlations\nacross long and short time horizons, utilizing two imputation blocks that\noperate in opposite temporal directions. Each block integrates recurrent layers\nand attention mechanisms to effectively resolve long-term dependencies.\n We evaluate BRATI on three real-world datasets under diverse missing-data\nscenarios: randomly missing values, fixed-length missing sequences, and\nvariable-length missing sequences. Our findings demonstrate that BRATI\nconsistently outperforms state-of-the-art models, delivering superior accuracy\nand robustness in imputing multivariate time-series data.\n","authors":["Armando Collado-Villaverde","Pablo Muñoz","Maria D. 
R-Moreno"],"pdf_url":"https://arxiv.org/pdf/2501.05401v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05398v1","updated":"2025-01-09T17:47:34Z","published":"2025-01-09T17:47:34Z","title":"Mechanistic understanding and validation of large AI models with\n SemanticLens","summary":" Unlike human-engineered systems such as aeroplanes, where each component's\nrole and dependencies are well understood, the inner workings of AI models\nremain largely opaque, hindering verifiability and undermining trust. This\npaper introduces SemanticLens, a universal explanation method for neural\nnetworks that maps hidden knowledge encoded by components (e.g., individual\nneurons) into the semantically structured, multimodal space of a foundation\nmodel such as CLIP. In this space, unique operations become possible, including\n(i) textual search to identify neurons encoding specific concepts, (ii)\nsystematic analysis and comparison of model representations, (iii) automated\nlabelling of neurons and explanation of their functional roles, and (iv) audits\nto validate decision-making against requirements. Fully scalable and operating\nwithout human input, SemanticLens is shown to be effective for debugging and\nvalidation, summarizing model knowledge, aligning reasoning with expectations\n(e.g., adherence to the ABCDE-rule in melanoma classification), and detecting\ncomponents tied to spurious correlations and their associated training data. By\nenabling component-level understanding and validation, the proposed approach\nhelps bridge the \"trust gap\" between AI models and traditional engineered\nsystems. 
We provide code for SemanticLens on\nhttps://github.com/jim-berend/semanticlens and a demo on\nhttps://semanticlens.hhi-research-insights.eu.\n","authors":["Maximilian Dreyer","Jim Berend","Tobias Labarta","Johanna Vielhaben","Thomas Wiegand","Sebastian Lapuschkin","Wojciech Samek"],"pdf_url":"https://arxiv.org/pdf/2501.05398v1.pdf","comment":"74 pages (18 pages manuscript, 7 pages references, 49 pages appendix)"},{"id":"http://arxiv.org/abs/2110.01593v7","updated":"2025-01-09T17:28:02Z","published":"2021-10-04T17:41:53Z","title":"Generalized Kernel Thinning","summary":" The kernel thinning (KT) algorithm of Dwivedi and Mackey (2021) compresses a\nprobability distribution more effectively than independent sampling by\ntargeting a reproducing kernel Hilbert space (RKHS) and leveraging a less\nsmooth square-root kernel. Here we provide four improvements. First, we show\nthat KT applied directly to the target RKHS yields tighter, dimension-free\nguarantees for any kernel, any distribution, and any fixed function in the\nRKHS. Second, we show that, for analytic kernels like Gaussian, inverse\nmultiquadric, and sinc, target KT admits maximum mean discrepancy (MMD)\nguarantees comparable to or better than those of square-root KT without making\nexplicit use of a square-root kernel. Third, we prove that KT with a fractional\npower kernel yields better-than-Monte-Carlo MMD guarantees for non-smooth\nkernels, like Laplace and Mat\\'ern, that do not have square-roots. Fourth, we\nestablish that KT applied to a sum of the target and power kernels (a procedure\nwe call KT+) simultaneously inherits the improved MMD guarantees of power KT\nand the tighter individual function guarantees of target KT. 
In our experiments\nwith target KT and KT+, we witness significant improvements in integration\nerror even in $100$ dimensions and when compressing challenging differential\nequation posteriors.\n","authors":["Raaz Dwivedi","Lester Mackey"],"pdf_url":"https://arxiv.org/pdf/2110.01593v7.pdf","comment":"Corrected B-spline and Sinc rates in Table 3"},{"id":"http://arxiv.org/abs/2501.05387v1","updated":"2025-01-09T17:21:00Z","published":"2025-01-09T17:21:00Z","title":"Integrating Explainable AI for Effective Malware Detection in Encrypted\n Network Traffic","summary":" Encrypted network communication ensures confidentiality, integrity, and\nprivacy between endpoints. However, attackers are increasingly exploiting\nencryption to conceal malicious behavior. Detecting unknown encrypted malicious\ntraffic without decrypting the payloads remains a significant challenge. In\nthis study, we investigate the integration of explainable artificial\nintelligence (XAI) techniques to detect malicious network traffic. We employ\nensemble learning models to identify malicious activity using multi-view\nfeatures extracted from various aspects of encrypted communication. To\neffectively represent malicious communication, we compiled a robust dataset\nwith 1,127 unique connections, more than any other available open-source\ndataset, and spanning 54 malware families. Our models were benchmarked against\nthe CTU-13 dataset, achieving performance of over 99% accuracy, precision, and\nF1-score. Additionally, the eXtreme Gradient Boosting (XGB) model demonstrated\n99.32% accuracy, 99.53% precision, and 99.43% F1-score on our custom dataset.\nBy leveraging Shapley Additive Explanations (SHAP), we identified that the\nmaximum packet size, mean inter-arrival time of packets, and transport layer\nsecurity version used are the most critical features for the global model\nexplanation. 
Furthermore, key features were identified as important for local\nexplanations across both datasets for individual traffic samples. These\ninsights provide a deeper understanding of the model decision-making process,\nenhancing the transparency and reliability of detecting malicious encrypted\ntraffic.\n","authors":["Sileshi Nibret Zeleke","Amsalu Fentie Jember","Mario Bochicchio"],"pdf_url":"https://arxiv.org/pdf/2501.05387v1.pdf","comment":"Accepted and presented on PanAfriCon AI 2024"},{"id":"http://arxiv.org/abs/2501.05370v1","updated":"2025-01-09T16:50:16Z","published":"2025-01-09T16:50:16Z","title":"Accelerated Diffusion Models via Speculative Sampling","summary":" Speculative sampling is a popular technique for accelerating inference in\nLarge Language Models by generating candidate tokens using a fast draft model\nand accepting or rejecting them based on the target model's distribution. While\nspeculative sampling was previously limited to discrete sequences, we extend it\nto diffusion models, which generate samples via continuous, vector-valued\nMarkov chains. In this context, the target model is a high-quality but\ncomputationally expensive diffusion model. 
We propose various drafting\nstrategies, including a simple and effective approach that does not require\ntraining a draft model and is applicable out of the box to any diffusion model.\nOur experiments demonstrate significant generation speedup on various diffusion\nmodels, halving the number of function evaluations, while generating exact\nsamples from the target model.\n","authors":["Valentin De Bortoli","Alexandre Galashov","Arthur Gretton","Arnaud Doucet"],"pdf_url":"https://arxiv.org/pdf/2501.05370v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05368v1","updated":"2025-01-09T16:49:04Z","published":"2025-01-09T16:49:04Z","title":"Developing a Foundation of Vector Symbolic Architectures Using Category\n Theory","summary":" At the risk of overstating the case, connectionist approaches to machine\nlearning, i.e. neural networks, are enjoying a small vogue right now. However,\nthese methods require large volumes of data and produce models that are\nuninterpretable to humans. An alternative framework that is compatible with\nneural networks and gradient-based learning, but explicitly models\ncompositionality, is Vector Symbolic Architectures (VSAs). VSAs are a family of\nalgebras on high-dimensional vector representations. They arose in cognitive\nscience from the need to unify neural processing and the kind of symbolic\nreasoning that humans perform. While machine learning methods have benefited\nfrom category theoretical analyses, VSAs have not yet received similar\ntreatment. In this paper, we present a first attempt at applying category\ntheory to VSAs. Specifically, we conduct a brief literature survey\ndemonstrating the lacking intersection of these two topics, provide a list of\ndesiderata for VSAs, and propose that VSAs may be understood as a (division)\nrig in a category enriched over a monoid in Met (the category of Lawvere metric\nspaces). This final contribution suggests that VSAs may be generalised beyond\ncurrent implementations. 
It is our hope that grounding VSAs in category theory\nwill lead to more rigorous connections with other research, both within and\nbeyond, learning and cognition.\n","authors":["Nolan P Shaw","P Michael Furlong","Britt Anderson","Jeff Orchard"],"pdf_url":"https://arxiv.org/pdf/2501.05368v1.pdf","comment":"13 pages, no figures, 2 tables, one appendix"},{"id":"http://arxiv.org/abs/2501.05361v1","updated":"2025-01-09T16:44:53Z","published":"2025-01-09T16:44:53Z","title":"No-Regret Linear Bandits under Gap-Adjusted Misspecification","summary":" This work studies linear bandits under a new notion of gap-adjusted\nmisspecification and is an extension of Liu et al. (2023). When the underlying\nreward function is not linear, existing linear bandits work usually relies on a\nuniform misspecification parameter $\\epsilon$ that measures the sup-norm error\nof the best linear approximation. This results in an unavoidable linear regret\nwhenever $\\epsilon > 0$. We propose a more natural model of misspecification\nwhich only requires the approximation error at each input $x$ to be\nproportional to the suboptimality gap at $x$. It captures the intuition that,\nfor optimization problems, near-optimal regions should matter more and we can\ntolerate larger approximation errors in suboptimal regions.\n Quite surprisingly, we show that the classical LinUCB algorithm -- designed\nfor the realizable case -- is automatically robust against such\n$\\rho$-gap-adjusted misspecification with parameter $\\rho$ diminishing at\n$O(1/(d \\sqrt{\\log T}))$. It achieves a near-optimal $O(\\sqrt{T})$ regret for\nproblems that the best-known regret is almost linear in time horizon $T$. We\nfurther advance this frontier by presenting a novel phased elimination-based\nalgorithm whose gap-adjusted misspecification parameter $\\rho = O(1/\\sqrt{d})$\ndoes not scale with $T$. 
This algorithm attains optimal $O(\\sqrt{T})$ regret\nand is deployment-efficient, requiring only $\\log T$ batches of exploration. It\nalso enjoys an adaptive $O(\\log T)$ regret when a constant suboptimality gap\nexists. Technically, our proof relies on a novel self-bounding argument that\nbounds the part of the regret due to misspecification by the regret itself, and\na new inductive lemma that limits the misspecification error within the\nsuboptimality gap for all valid actions in each batch selected by G-optimal\ndesign.\n","authors":["Chong Liu","Dan Qiao","Ming Yin","Ilija Bogunovic","Yu-Xiang Wang"],"pdf_url":"https://arxiv.org/pdf/2501.05361v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2302.13252"},{"id":"http://arxiv.org/abs/2412.20138v2","updated":"2025-01-09T16:36:26Z","published":"2024-12-28T12:54:06Z","title":"TradingAgents: Multi-Agents LLM Financial Trading Framework","summary":" Significant progress has been made in automated problem-solving using\nsocieties of agents powered by large language models (LLMs). In finance,\nefforts have largely focused on single-agent systems handling specific tasks or\nmulti-agent frameworks independently gathering data. However, multi-agent\nsystems' potential to replicate real-world trading firms' collaborative\ndynamics remains underexplored. TradingAgents proposes a novel stock trading\nframework inspired by trading firms, featuring LLM-powered agents in\nspecialized roles such as fundamental analysts, sentiment analysts, technical\nanalysts, and traders with varied risk profiles. The framework includes Bull\nand Bear researcher agents assessing market conditions, a risk management team\nmonitoring exposure, and traders synthesizing insights from debates and\nhistorical data to make informed decisions. By simulating a dynamic,\ncollaborative trading environment, this framework aims to improve trading\nperformance. 
Detailed architecture and extensive experiments reveal its\nsuperiority over baseline models, with notable improvements in cumulative\nreturns, Sharpe ratio, and maximum drawdown, highlighting the potential of\nmulti-agent LLM frameworks in financial trading. More details on TradingAgents\nare available at https://TradingAgents-AI.github.io.\n","authors":["Yijia Xiao","Edward Sun","Di Luo","Wei Wang"],"pdf_url":"https://arxiv.org/pdf/2412.20138v2.pdf","comment":"Multi-Agent AI in the Real World @ AAAI 2025"},{"id":"http://arxiv.org/abs/2411.10087v2","updated":"2025-01-09T16:22:42Z","published":"2024-11-15T10:16:38Z","title":"PFML: Self-Supervised Learning of Time-Series Data Without\n Representation Collapse","summary":" Self-supervised learning (SSL) is a data-driven learning approach that\nutilizes the innate structure of the data to guide the learning process. In\ncontrast to supervised learning, which depends on external labels, SSL utilizes\nthe inherent characteristics of the data to produce its own supervisory signal.\nHowever, one frequent issue with SSL methods is representation collapse, where\nthe model outputs a constant input-invariant feature representation. This issue\nhinders the potential application of SSL methods to new data modalities, as\ntrying to avoid representation collapse wastes researchers' time and effort.\nThis paper introduces a novel SSL algorithm for time-series data called\nPrediction of Functionals from Masked Latents (PFML). Instead of predicting\nmasked input signals or their latent representations directly, PFML operates by\npredicting statistical functionals of the input signal corresponding to masked\nembeddings, given a sequence of unmasked embeddings. The algorithm is designed\nto avoid representation collapse, rendering it straightforwardly applicable to\ndifferent time-series data domains, such as novel sensor modalities in clinical\ndata. 
We demonstrate the effectiveness of PFML through complex, real-life\nclassification tasks across three different data modalities: infant posture and\nmovement classification from multi-sensor inertial measurement unit data,\nemotion recognition from speech data, and sleep stage classification from EEG\ndata. The results show that PFML is superior to a conceptually similar SSL\nmethod and a contrastive learning-based SSL method. Additionally, PFML is on\npar with the current state-of-the-art SSL method, while also being conceptually\nsimpler and without suffering from representation collapse.\n","authors":["Einari Vaaras","Manu Airaksinen","Okko Räsänen"],"pdf_url":"https://arxiv.org/pdf/2411.10087v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05336v1","updated":"2025-01-09T16:02:51Z","published":"2025-01-09T16:02:51Z","title":"Stream Aligner: Efficient Sentence-Level Alignment via Distribution\n Induction","summary":" The rapid advancement of large language models (LLMs) has led to significant\nimprovements in their capabilities, but also to increased concerns about their\nalignment with human values and intentions. Current alignment strategies,\nincluding adaptive training and inference-time methods, have demonstrated\npotential in this area. However, these approaches still struggle to balance\ndeployment complexity and capability across various tasks and difficulties. In\nthis work, we introduce the Streaming Distribution Induce Aligner (Stream\nAligner), a novel alignment paradigm that combines efficiency with enhanced\nperformance in various tasks throughout the generation process. Stream Aligner\nachieves dynamic sentence-level correction by using a small model to learn the\npreferences of the suffix sentence, iteratively correcting the suffix sentence\noutput by the upstream model, and then using the corrected sentence to replace\nthe suffix sentence in subsequent generations. 
Compared to Aligner, our\nexperiments demonstrate that Stream Aligner reduces reliance on the\ncapabilities of additional models, enhances the reasoning abilities of LLMs,\nand decreases latency during user interaction. Specifically, Stream Aligner-2B\nmodel has achieved an improvement of 76.1% in helpfulness, 36.0% in\nharmlessness on the tested Llama2-70B-chat model, and Stream Aligner-8B has\nachieved an improvement of 3.5% on the math ability of the tested\nLlama3-70B-Instruct model.\n","authors":["Hantao Lou","Jiaming Ji","Kaile Wang","Yaodong Yang"],"pdf_url":"https://arxiv.org/pdf/2501.05336v1.pdf","comment":"AAAI Alignment Track 2025 Poster"},{"id":"http://arxiv.org/abs/2501.05333v1","updated":"2025-01-09T15:59:15Z","published":"2025-01-09T15:59:15Z","title":"Stability and List-Replicability for Agnostic Learners","summary":" Two seminal papers--Alon, Livni, Malliaris, Moran (STOC 2019) and Bun, Livni,\nand Moran (FOCS 2020)--established the equivalence between online learnability\nand globally stable PAC learnability in binary classification. However, Chase,\nChornomaz, Moran, and Yehudayoff (STOC 2024) recently showed that this\nequivalence does not hold in the agnostic setting. Specifically, they proved\nthat in the agnostic setting, only finite hypothesis classes are globally\nstable learnable. Therefore, agnostic global stability is too restrictive to\ncapture interesting hypothesis classes.\n To address this limitation, Chase \\emph{et al.} introduced two relaxations of\nagnostic global stability. In this paper, we characterize the classes that are\nlearnable under their proposed relaxed conditions, resolving the two open\nproblems raised in their work.\n First, we prove that in the setting where the stability parameter can depend\non the excess error (the gap between the learner's error and the best\nachievable error by the hypothesis class), agnostic stability is fully\ncharacterized by the Littlestone dimension. 
Consequently, as in the realizable\ncase, this form of learnability is equivalent to online learnability.\n As part of the proof of this theorem, we strengthen the celebrated result of\nBun et al. by showing that classes with infinite Littlestone dimension are not\nstably PAC learnable, even if we allow the stability parameter to depend on the\nexcess error.\n For the second relaxation proposed by Chase et al., we prove that only finite\nhypothesis classes are globally stable learnable even if we restrict the\nagnostic setting to distributions with small population loss.\n","authors":["Ari Blonda","Shan Gao","Hamed Hatami","Pooya Hatami"],"pdf_url":"https://arxiv.org/pdf/2501.05333v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05329v1","updated":"2025-01-09T15:55:08Z","published":"2025-01-09T15:55:08Z","title":"Knowledge Transfer in Model-Based Reinforcement Learning Agents for\n Efficient Multi-Task Learning","summary":" We propose an efficient knowledge transfer approach for model-based\nreinforcement learning, addressing the challenge of deploying large world\nmodels in resource-constrained environments. Our method distills a\nhigh-capacity multi-task agent (317M parameters) into a compact 1M parameter\nmodel, achieving state-of-the-art performance on the MT30 benchmark with a\nnormalized score of 28.45, a substantial improvement over the original 1M\nparameter model's score of 18.93. This demonstrates the ability of our\ndistillation technique to consolidate complex multi-task knowledge effectively.\nAdditionally, we apply FP16 post-training quantization, reducing the model size\nby 50% while maintaining performance. 
Our work bridges the gap between the\npower of large models and practical deployment constraints, offering a scalable\nsolution for efficient and accessible multi-task reinforcement learning in\nrobotics and other resource-limited domains.\n","authors":["Dmytro Kuzmenko","Nadiya Shvai"],"pdf_url":"https://arxiv.org/pdf/2501.05329v1.pdf","comment":"Preprint of an extended abstract accepted to AAMAS 2025"},{"id":"http://arxiv.org/abs/2501.05325v1","updated":"2025-01-09T15:50:02Z","published":"2025-01-09T15:50:02Z","title":"The explanation dialogues: an expert focus study to understand\n requirements towards explanations within the GDPR","summary":" Explainable AI (XAI) provides methods to understand non-interpretable machine\nlearning models. However, we have little knowledge about what legal experts\nexpect from these explanations, including their legal compliance with, and\nvalue against European Union legislation. To close this gap, we present the\nExplanation Dialogues, an expert focus study to uncover the expectations,\nreasoning, and understanding of legal experts and practitioners towards XAI,\nwith a specific focus on the European General Data Protection Regulation. The\nstudy consists of an online questionnaire and follow-up interviews, and is\ncentered around a use-case in the credit domain. We extract both a set of\nhierarchical and interconnected codes using grounded theory, and present the\nstandpoints of the participating experts towards XAI. We find that the\npresented explanations are hard to understand and lack information, and discuss\nissues that can arise from the different interests of the data controller and\nsubject. Finally, we present a set of recommendations for developers of XAI\nmethods, and indications of legal areas of discussion. 
Among others,\nrecommendations address the presentation, choice, and content of an\nexplanation, technical risks as well as the end-user, while we provide legal\npointers to the contestability of explanations, transparency thresholds,\nintellectual property rights as well as the relationship between involved\nparties.\n","authors":["Laura State","Alejandra Bringas Colmenarejo","Andrea Beretta","Salvatore Ruggieri","Franco Turini","Stephanie Law"],"pdf_url":"https://arxiv.org/pdf/2501.05325v1.pdf","comment":"Artificial Intelligence and Law (Springer Nature)"},{"id":"http://arxiv.org/abs/2501.05323v1","updated":"2025-01-09T15:48:29Z","published":"2025-01-09T15:48:29Z","title":"Distributed Learning and Inference Systems: A Networking Perspective","summary":" Machine learning models have achieved, and in some cases surpassed,\nhuman-level performance in various tasks, mainly through centralized training\nof static models and the use of large models stored in centralized clouds for\ninference. However, this centralized approach has several drawbacks, including\nprivacy concerns, high storage demands, a single point of failure, and\nsignificant computing requirements. These challenges have driven interest in\ndeveloping alternative decentralized and distributed methods for AI training\nand inference. Distribution introduces additional complexity, as it requires\nmanaging multiple moving parts. To address these complexities and fill a gap in\nthe development of distributed AI systems, this work proposes a novel\nframework, Data and Dynamics-Aware Inference and Training Networks (DA-ITN).\nThe different components of DA-ITN and their functions are explored, and the\nassociated challenges and research areas are highlighted.\n","authors":["Hesham G. Moussa","Arashmid Akhavain","S. 
Maryam Hosseini","Bill McCormick"],"pdf_url":"https://arxiv.org/pdf/2501.05323v1.pdf","comment":"This paper has been submitted to IEEE Network magazine and is still\n under review"},{"id":"http://arxiv.org/abs/2406.05405v3","updated":"2025-01-09T15:47:33Z","published":"2024-06-08T08:56:47Z","title":"Robust Conformal Prediction Using Privileged Information","summary":" We develop a method to generate prediction sets with a guaranteed coverage\nrate that is robust to corruptions in the training data, such as missing or\nnoisy variables. Our approach builds on conformal prediction, a powerful\nframework to construct prediction sets that are valid under the i.i.d\nassumption. Importantly, naively applying conformal prediction does not provide\nreliable predictions in this setting, due to the distribution shift induced by\nthe corruptions. To account for the distribution shift, we assume access to\nprivileged information (PI). The PI is formulated as additional features that\nexplain the distribution shift, however, they are only available during\ntraining and absent at test time. We approach this problem by introducing a\nnovel generalization of weighted conformal prediction and support our method\nwith theoretical coverage guarantees. 
Empirical experiments on both real and\nsynthetic datasets indicate that our approach achieves a valid coverage rate\nand constructs more informative predictions compared to existing methods, which\nare not supported by theoretical guarantees.\n","authors":["Shai Feldman","Yaniv Romano"],"pdf_url":"https://arxiv.org/pdf/2406.05405v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03145v2","updated":"2025-01-09T15:31:29Z","published":"2025-01-06T17:12:19Z","title":"Geometry Restoration and Dewarping of Camera-Captured Document Images","summary":" This research focuses on developing a method for restoring the topology of\ndigital images of paper documents captured by a camera, using algorithms for\ndetection, segmentation, geometry restoration, and dewarping. Our methodology\nemploys deep learning (DL) for document outline detection, followed by computer\nvision (CV) to create a topological 2D grid using cubic polynomial\ninterpolation and correct nonlinear distortions by remapping the image. Using\nclassical CV methods makes the document topology restoration process more\nefficient and faster, as it requires significantly fewer computational\nresources and memory. We developed a new pipeline for automatic document\ndewarping and reconstruction, along with a framework and annotated dataset to\ndemonstrate its efficiency. Our experiments confirm the promise of our\nmethodology and its superiority over existing benchmarks (including mobile apps\nand popular DL solutions, such as RectiNet, DocGeoNet, and DocTr++) both\nvisually and in terms of document readability via Optical Character Recognition\n(OCR) and geometry restoration metrics. This paves the way for creating\nhigh-quality digital copies of paper documents and enhancing the efficiency of\nOCR systems. 
Project page: https://github.com/HorizonParadox/DRCCBI\n","authors":["Valery Istomin","Oleg Pereziabov","Ilya Afanasyev"],"pdf_url":"https://arxiv.org/pdf/2501.03145v2.pdf","comment":"28 pages, 16 figures"},{"id":"http://arxiv.org/abs/2501.05313v1","updated":"2025-01-09T15:29:33Z","published":"2025-01-09T15:29:33Z","title":"Optimizing Distributed Deployment of Mixture-of-Experts Model Inference\n in Serverless Computing","summary":" With the advancement of serverless computing, running machine learning (ML)\ninference services over a serverless platform has been advocated, given its\nlabor-free scalability and cost effectiveness. Mixture-of-Experts (MoE) models\nhave been a dominant type of model architectures to enable large models\nnowadays, with parallel expert networks. Serving large MoE models on serverless\ncomputing is potentially beneficial, but has been underexplored due to\nsubstantial challenges in handling the skewed expert popularity and\nscatter-gather communication bottleneck in MoE model execution, for\ncost-efficient serverless MoE deployment and performance guarantee. We study\noptimized MoE model deployment and distributed inference serving on a\nserverless platform, that effectively predict expert selection, pipeline\ncommunication with model execution, and minimize the overall billed cost of\nserving MoE models. Especially, we propose a Bayesian optimization framework\nwith multi-dimensional epsilon-greedy search to learn expert selections and\noptimal MoE deployment achieving optimal billed cost, including: 1) a Bayesian\ndecision-making method for predicting expert popularity; 2) flexibly pipelined\nscatter-gather communication; and 3) an optimal model deployment algorithm for\ndistributed MoE serving. Extensive experiments on AWS Lambda show that our\ndesigns reduce the billed cost of all MoE layers by at least 75.67% compared to\nCPU clusters while maintaining satisfactory inference throughput. 
As compared\nto LambdaML in serverless computing, our designs achieves 43.41% lower cost\nwith a throughput decrease of at most 18.76%.\n","authors":["Mengfan Liu","Wei Wang","Chuan Wu"],"pdf_url":"https://arxiv.org/pdf/2501.05313v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05309v1","updated":"2025-01-09T15:25:07Z","published":"2025-01-09T15:25:07Z","title":"Private Selection with Heterogeneous Sensitivities","summary":" Differentially private (DP) selection involves choosing a high-scoring\ncandidate from a finite candidate pool, where each score depends on a sensitive\ndataset. This problem arises naturally in a variety of contexts including model\nselection, hypothesis testing, and within many DP algorithms. Classical\nmethods, such as Report Noisy Max (RNM), assume all candidates' scores are\nequally sensitive to changes in a single individual's data, but this often\nisn't the case. To address this, algorithms like the Generalised Exponential\nMechanism (GEM) leverage variability in candidate sensitivities. However, we\nobserve that while these algorithms can outperform RNM in some situations, they\nmay underperform in others - they can even perform worse than random selection.\nIn this work, we explore how the distribution of scores and sensitivities\nimpacts DP selection mechanisms. In all settings we study, we find that there\nexists a mechanism that utilises heterogeneity in the candidate sensitivities\nthat outperforms standard mechanisms like RNM. However, no single mechanism\nuniformly outperforms RNM. We propose using the correlation between the scores\nand sensitivities as the basis for deciding which DP selection mechanism to\nuse. Further, we design a slight variant of GEM, modified GEM that generally\nperforms well whenever GEM performs poorly. 
Relying on the correlation\nheuristic we propose combined GEM, which adaptively chooses between GEM and\nmodified GEM and outperforms both in polarised settings.\n","authors":["Daniela Antonova","Allegra Laro","Audra McMillan","Lorenz Wolf"],"pdf_url":"https://arxiv.org/pdf/2501.05309v1.pdf","comment":"21 pages, 18 figures"},{"id":"http://arxiv.org/abs/2412.16378v2","updated":"2025-01-09T15:20:31Z","published":"2024-12-20T22:25:23Z","title":"REFA: Reference Free Alignment for multi-preference optimization","summary":" We introduce REFA, a family of reference-free alignment methods that optimize\nover multiple user preferences while enforcing fine-grained length control. Our\napproach integrates deviation-based weighting to emphasize high-quality\nresponses more strongly, length normalization to prevent trivial short-response\nsolutions, and an EOS-probability regularizer to mitigate dataset-induced\nbrevity biases. Theoretically, we show that under the Uncertainty Reduction\nwith Sequence Length Assertion (URSLA), naive length normalization can still\nincentivize length-based shortcuts. By contrast, REFA corrects these subtle\nincentives, guiding models toward genuinely more informative and higher-quality\noutputs. Empirically, REFA sets a new state-of-the-art among reference-free\nalignment methods, producing richer responses aligned more closely with human\npreferences. Compared to a base supervised fine-tuned (SFT) mistral-7b model\nthat achieves 8.4% length-controlled win rate (LC-WR) and 6.2% win rate (WR),\nour best REFA configuration attains 21.62% LC-WR and 19.87% WR on the\nAlpacaEval v2 benchmark. 
This represents a substantial improvement over both\nthe strongest multi-preference baseline, InfoNCA (16.82% LC-WR, 10.44% WR), and\nthe strongest reference-free baseline, SimPO (20.01% LC-WR, 17.65% WR)\n","authors":["Taneesh Gupta","Rahul Madhavan","Xuchao Zhang","Chetan Bansal","Saravan Rajmohan"],"pdf_url":"https://arxiv.org/pdf/2412.16378v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.19528v3","updated":"2025-01-09T15:12:04Z","published":"2024-10-25T12:53:33Z","title":"AgentForge: A Flexible Low-Code Platform for Reinforcement Learning\n Agent Design","summary":" Developing a reinforcement learning (RL) agent often involves identifying\nvalues for numerous parameters, covering the policy, reward function,\nenvironment, and agent-internal architecture. Since these parameters are\ninterrelated in complex ways, optimizing them is a black-box problem that\nproves especially challenging for nonexperts. Although existing\noptimization-as-a-service platforms (e.g., Vizier and Optuna) can handle such\nproblems, they are impractical for RL systems, since the need for manual user\nmapping of each parameter to distinct components makes the effort cumbersome.\nIt also requires understanding of the optimization process, limiting the\nsystems' application beyond the machine learning field and restricting access\nin areas such as cognitive science, which models human decision-making. To\ntackle these challenges, the paper presents AgentForge, a flexible low-code\nplatform to optimize any parameter set across an RL system. Available at\nhttps://github.com/feferna/AgentForge, it allows an optimization problem to be\ndefined in a few lines of code and handed to any of the interfaced optimizers.\nWith AgentForge, the user can optimize the parameters either individually or\njointly. 
The paper presents an evaluation of its performance for a challenging\nvision-based RL problem.\n","authors":["Francisco Erivaldo Fernandes Junior","Antti Oulasvirta"],"pdf_url":"https://arxiv.org/pdf/2410.19528v3.pdf","comment":"This paper has been accepted at the 17th International Conference on\n Agents and Artificial Intelligence (ICAART 2025)"},{"id":"http://arxiv.org/abs/2409.07387v2","updated":"2025-01-09T14:58:03Z","published":"2024-09-11T16:21:44Z","title":"A Contrastive Symmetric Forward-Forward Algorithm (SFFA) for Continual\n Learning Tasks","summary":" The so-called Forward-Forward Algorithm (FFA) has recently gained momentum as\nan alternative to the conventional back-propagation algorithm for neural\nnetwork learning, yielding competitive performance across various modeling\ntasks. By replacing the backward pass of gradient back-propagation with two\ncontrastive forward passes, the FFA avoids several shortcomings undergone by\nits predecessor (e.g., vanishing/exploding gradient) by enabling layer-wise\ntraining heuristics. In classification tasks, this contrastive method has been\nproven to effectively create a latent sparse representation of the input data,\nultimately favoring discriminability. However, FFA exhibits an inherent\nasymmetric gradient behavior due to an imbalanced loss function between\npositive and negative data, adversely impacting on the model's generalization\ncapabilities and leading to an accuracy degradation. To address this issue,\nthis work proposes the Symmetric Forward-Forward Algorithm (SFFA), a novel\nmodification of the original FFA which partitions each layer into positive and\nnegative neurons. This allows the local fitness function to be defined as the\nratio between the activation of positive neurons and the overall layer\nactivity, resulting in a symmetric loss landscape during the training phase. 
To\nevaluate the enhanced convergence of our method, we conduct several experiments\nusing multiple image classification benchmarks, comparing the accuracy of\nmodels trained with SFFA to those trained with its FFA counterpart. As a\nbyproduct of this reformulation, we explore the advantages of using a\nlayer-wise training algorithm for Continual Learning (CL) tasks. The\nspecialization of neurons and the sparsity of their activations induced by\nlayer-wise training algorithms enable efficient CL strategies that incorporate\nnew knowledge (classes) into the neural network, while preventing catastrophic\nforgetting of previously...\n","authors":["Erik B. Terres-Escudero","Javier Del Ser","Pablo Garcia Bringas"],"pdf_url":"https://arxiv.org/pdf/2409.07387v2.pdf","comment":"Accepted at 3rd Conference on Lifelong Learning Agents (CoLLAs), 2024"},{"id":"http://arxiv.org/abs/2412.16220v3","updated":"2025-01-09T14:55:29Z","published":"2024-12-18T10:56:40Z","title":"Cross-Attention Graph Neural Networks for Inferring Gene Regulatory\n Networks with Skewed Degree Distribution","summary":" Inferencing Gene Regulatory Networks (GRNs) from gene expression data is a\npivotal challenge in systems biology, and several innovative computational\nmethods have been introduced. However, most of these studies have not\nconsidered the skewed degree distribution of genes. Specifically, some genes\nmay regulate multiple target genes while some genes may be regulated by\nmultiple regulator genes. Such a skewed degree distribution issue significantly\ncomplicates the application of directed graph embedding methods. To tackle this\nissue, we propose the Cross-Attention Complex Dual Graph Embedding Model\n(XATGRN). Our XATGRN employs a cross-attention mechanism to effectively capture\nintricate gene interactions from gene expression profiles. 
Additionally, it\nuses a Dual Complex Graph Embedding approach to manage the skewed degree\ndistribution, thereby ensuring precise prediction of regulatory relationships\nand their directionality. Our model consistently outperforms existing\nstate-of-the-art methods across various datasets, underscoring its efficacy in\nelucidating complex gene regulatory mechanisms. Our codes used in this paper\nare publicly available at: https://github.com/kikixiong/XATGRN.\n","authors":["Jiaqi Xiong","Nan Yin","Shiyang Liang","Haoyang Li","Yingxu Wang","Duo Ai","Fang Pan","Jingjie Wang"],"pdf_url":"https://arxiv.org/pdf/2412.16220v3.pdf","comment":"11 pages, 6 figures,1 tabels"},{"id":"http://arxiv.org/abs/2501.01480v2","updated":"2025-01-09T14:52:13Z","published":"2025-01-02T15:09:00Z","title":"Drift2Matrix: Kernel-Induced Self Representation for Concept Drift\n Adaptation in Co-evolving Time Series","summary":" In the realm of time series analysis, tackling the phenomenon of concept\ndrift poses a significant challenge. Concept drift -- characterized by the\nevolving statistical properties of time series data, affects the reliability\nand accuracy of conventional analysis models. This is particularly evident in\nco-evolving scenarios where interactions among variables are crucial. This\npaper presents Drift2Matrix, a novel framework that leverages kernel-induced\nself-representation for adaptive responses to concept drift in time series.\nDrift2Matrix employs a kernel-based learning mechanism to generate a\nrepresentation matrix, encapsulating the inherent dynamics of co-evolving time\nseries. This matrix serves as a key tool for identification and adaptation to\nconcept drift by observing its temporal variations. Furthermore, Drift2Matrix\neffectively identifies prevailing patterns and offers insights into emerging\ntrends through pattern evolution analysis. 
Our empirical evaluation of\nDrift2Matrix across various datasets demonstrates its effectiveness in handling\nthe complexities of concept drift. This approach introduces a novel perspective\nin the theoretical domain of co-evolving time series analysis, enhancing\nadaptability and accuracy in the face of dynamic data environments.\n","authors":["Kunpeng Xu","Lifei Chen","Shengrui Wang"],"pdf_url":"https://arxiv.org/pdf/2501.01480v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05281v1","updated":"2025-01-09T14:43:36Z","published":"2025-01-09T14:43:36Z","title":"Comparison Study: Glacier Calving Front Delineation in Synthetic\n Aperture Radar Images With Deep Learning","summary":" Calving front position variation of marine-terminating glaciers is an\nindicator of ice mass loss and a crucial parameter in numerical glacier models.\nDeep Learning (DL) systems can automatically extract this position from\nSynthetic Aperture Radar (SAR) imagery, enabling continuous, weather- and\nillumination-independent, large-scale monitoring. This study presents the first\ncomparison of DL systems on a common calving front benchmark dataset. A\nmulti-annotator study with ten annotators is performed to contrast the\nbest-performing DL system against human performance. The best DL model's\noutputs deviate 221 m on average, while the average deviation of the human\nannotators is 38 m. This significant difference shows that current DL systems\ndo not yet match human performance and that further research is needed to\nenable fully automated monitoring of glacier calving fronts. 
The study of\nVision Transformers, foundation models, and the inclusion and processing\nstrategy of more information are identified as avenues for future research.\n","authors":["Nora Gourmelon","Konrad Heidler","Erik Loebel","Daniel Cheng","Julian Klink","Anda Dong","Fei Wu","Noah Maul","Moritz Koch","Marcel Dreier","Dakota Pyles","Thorsten Seehaus","Matthias Braun","Andreas Maier","Vincent Christlein"],"pdf_url":"https://arxiv.org/pdf/2501.05281v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05279v1","updated":"2025-01-09T14:43:08Z","published":"2025-01-09T14:43:08Z","title":"Learning convolution operators on compact Abelian groups","summary":" We consider the problem of learning convolution operators associated to\ncompact Abelian groups. We study a regularization-based approach and provide\ncorresponding learning guarantees, discussing natural regularity condition on\nthe convolution kernel. More precisely, we assume the convolution kernel is a\nfunction in a translation invariant Hilbert space and analyze a natural ridge\nregression (RR) estimator. Building on existing results for RR, we characterize\nthe accuracy of the estimator in terms of finite sample bounds. Interestingly,\nregularity assumptions which are classical in the analysis of RR, have a novel\nand natural interpretation in terms of space/frequency localization.\nTheoretical results are illustrated by numerical simulations.\n","authors":["Emilia Magnani","Ernesto De Vito","Philipp Hennig","Lorenzo Rosasco"],"pdf_url":"https://arxiv.org/pdf/2501.05279v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05278v1","updated":"2025-01-09T14:39:40Z","published":"2025-01-09T14:39:40Z","title":"Off-Policy Evaluation and Counterfactual Methods in Dynamic Auction\n Environments","summary":" Counterfactual estimators are critical for learning and refining policies\nusing logged data, a process known as Off-Policy Evaluation (OPE). 
OPE allows\nresearchers to assess new policies without costly experiments, speeding up the\nevaluation process. Online experimental methods, such as A/B tests, are\neffective but often slow, thus delaying the policy selection and optimization\nprocess.\n In this work, we explore the application of OPE methods in the context of\nresource allocation in dynamic auction environments. Given the competitive\nnature of environments where rapid decision-making is crucial for gaining a\ncompetitive edge, the ability to quickly and accurately assess algorithmic\nperformance is essential. By utilizing counterfactual estimators as a\npreliminary step before conducting A/B tests, we aim to streamline the\nevaluation process, reduce the time and resources required for experimentation,\nand enhance confidence in the chosen policies. Our investigation focuses on the\nfeasibility and effectiveness of using these estimators to predict the outcomes\nof potential resource allocation strategies, evaluate their performance, and\nfacilitate more informed decision-making in policy selection. Motivated by the\noutcomes of our initial study, we envision an advanced analytics system\ndesigned to seamlessly and dynamically assess new resource allocation\nstrategies and policies.\n","authors":["Ritam Guha","Nilavra Pathak"],"pdf_url":"https://arxiv.org/pdf/2501.05278v1.pdf","comment":"9 pages, 15 figures, IEEE format"},{"id":"http://arxiv.org/abs/2501.04572v2","updated":"2025-01-09T14:30:41Z","published":"2025-01-08T15:42:41Z","title":"Regret Analysis: a control perspective","summary":" Online learning and model reference adaptive control have many interesting\nintersections. One area where they differ however is in how the algorithms are\nanalyzed and what objective or metric is used to discriminate \"good\" algorithms\nfrom \"bad\" algorithms. 
In adaptive control there are usually two objectives: 1)\nprove that all time-varying parameters/states of the system are bounded, and 2)\nthat the instantaneous error between the adaptively controlled system and a\nreference system converges to zero over time (or at least a compact set). For\nonline learning the performance of algorithms is often characterized by the\nregret the algorithm incurs. Regret is defined as the cumulative loss (cost)\nover time from the online algorithm minus the cumulative loss (cost) of the\nsingle optimal fixed parameter choice in hindsight. Another significant\ndifference between the two areas of research is with regard to the assumptions\nmade in order to obtain said results. Adaptive control makes assumptions about\nthe input-output properties of the control problem and derives solutions for a\nfixed error model or optimization task. In the online learning literature,\nresults are derived for classes of loss functions (i.e. convex) while a priori\nassuming that all time-varying parameters are bounded, which for many\noptimization tasks is not unrealistic, but is a non-starter in control\napplications. In this work we discuss these differences in detail through the\nregret-based analysis of gradient descent for convex functions and the\ncontrol-based analysis of a streaming regression problem. We close with a discussion\nabout the newly defined paradigm of online adaptive control and ask the\nfollowing question: \"Are regret-optimal control strategies deployable?\"\n","authors":["Travis E. Gibson","Sawal Acharya"],"pdf_url":"https://arxiv.org/pdf/2501.04572v2.pdf","comment":"10 pages no figures"},{"id":"http://arxiv.org/abs/2501.05269v1","updated":"2025-01-09T14:26:50Z","published":"2025-01-09T14:26:50Z","title":"CellViT++: Energy-Efficient and Adaptive Cell Segmentation and\n Classification Using Foundation Models","summary":" Digital Pathology is a cornerstone in the diagnosis and treatment of\ndiseases. 
A key task in this field is the identification and segmentation of\ncells in hematoxylin and eosin-stained images. Existing methods for cell\nsegmentation often require extensive annotated datasets for training and are\nlimited to a predefined cell classification scheme. To overcome these\nlimitations, we propose $\\text{CellViT}^{{\\scriptscriptstyle ++}}$, a framework\nfor generalized cell segmentation in digital pathology.\n$\\text{CellViT}^{{\\scriptscriptstyle ++}}$ utilizes Vision Transformers with\nfoundation models as encoders to compute deep cell features and segmentation\nmasks simultaneously. To adapt to unseen cell types, we rely on a\ncomputationally efficient approach. It requires minimal data for training and\nleads to a drastically reduced carbon footprint. We demonstrate excellent\nperformance on seven different datasets, covering a broad spectrum of cell\ntypes, organs, and clinical settings. The framework achieves remarkable\nzero-shot segmentation and data-efficient cell-type classification.\nFurthermore, we show that $\\text{CellViT}^{{\\scriptscriptstyle ++}}$ can\nleverage immunofluorescence stainings to generate training datasets without the\nneed for pathologist annotations. The automated dataset generation approach\nsurpasses the performance of networks trained on manually labeled data,\ndemonstrating its effectiveness in creating high-quality training datasets\nwithout expert annotations. To advance digital pathology,\n$\\text{CellViT}^{{\\scriptscriptstyle ++}}$ is available as an open-source\nframework featuring a user-friendly, web-based interface for visualization and\nannotation. 
The code is available under\nhttps://github.com/TIO-IKIM/CellViT-plus-plus.\n","authors":["Fabian Hörst","Moritz Rempe","Helmut Becker","Lukas Heine","Julius Keyl","Jens Kleesiek"],"pdf_url":"https://arxiv.org/pdf/2501.05269v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05260v1","updated":"2025-01-09T14:14:18Z","published":"2025-01-09T14:14:18Z","title":"Enhancing Plagiarism Detection in Marathi with a Weighted Ensemble of\n TF-IDF and BERT Embeddings for Low-Resource Language Processing","summary":" Plagiarism involves using another person's work or concepts without proper\nattribution, presenting them as original creations. With the growing amount of\ndata communicated in regional languages such as Marathi -- one of India's\nregional languages -- it is crucial to design robust plagiarism detection\nsystems tailored for low-resource languages. Language models like Bidirectional\nEncoder Representations from Transformers (BERT) have demonstrated exceptional\ncapability in text representation and feature extraction, making them essential\ntools for semantic analysis and plagiarism detection. However, the application\nof BERT for low-resource languages remains under-explored, particularly in the\ncontext of plagiarism detection. This paper presents a method to enhance the\naccuracy of plagiarism detection for Marathi texts using BERT sentence\nembeddings in conjunction with Term Frequency-Inverse Document Frequency\n(TF-IDF) feature representation. 
This approach effectively captures\nstatistical, semantic, and syntactic aspects of text features through a\nweighted voting ensemble of machine learning models.\n","authors":["Atharva Mutsaddi","Aditya Choudhary"],"pdf_url":"https://arxiv.org/pdf/2501.05260v1.pdf","comment":"Accepted into LoResLM: The First Workshop on Language Models for\n Low-Resource Languages, colocated with COLING 2025 and set to be published\n into ACL Anthology"},{"id":"http://arxiv.org/abs/2410.20398v2","updated":"2025-01-09T14:11:34Z","published":"2024-10-27T10:06:09Z","title":"Evaluation of uncertainty estimations for Gaussian process regression\n based machine learning interatomic potentials","summary":" Uncertainty estimations for machine learning interatomic potentials (MLIPs)\nare crucial for quantifying model error and identifying informative training\nsamples in active learning strategies. In this study, we evaluate uncertainty\nestimations of Gaussian process regression (GPR)-based MLIPs, including the\npredictive GPR standard deviation and ensemble-based uncertainties. We do this\nin terms of calibration and in terms of impact on model performance in an\nactive learning scheme. We consider GPR models with Coulomb and Smooth Overlap\nof Atomic Positions (SOAP) representations as inputs to predict potential\nenergy surfaces and excitation energies of molecules. Regarding calibration, we\nfind that ensemble-based uncertainty estimations already show poor global\ncalibration (e.g., averaged over the whole test set). In contrast, the GPR\nstandard deviation shows good global calibration, but when grouping predictions\nby their uncertainty, we observe a systematic bias for predictions with high\nuncertainty. Although an increasing uncertainty correlates with an increasing\nbias, the bias is not captured quantitatively by the uncertainty. 
Therefore,\nthe GPR standard deviation can be useful to identify predictions with a high\nbias and error but, without further knowledge, should not be interpreted as a\nquantitative measure for a potential error range. Selecting the samples with\nthe highest GPR standard deviation from a fixed configuration space leads to a\nmodel that overemphasizes the borders of the configuration space represented in\nthe fixed dataset. This may result in worse performance in more densely sampled\nareas but better generalization for extrapolation tasks.\n","authors":["Matthias Holzenkamp","Dongyu Lyu","Ulrich Kleinekathöfer","Peter Zaspel"],"pdf_url":"https://arxiv.org/pdf/2410.20398v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.05838v2","updated":"2025-01-09T14:04:01Z","published":"2024-10-08T09:06:34Z","title":"Time Transfer: On Optimal Learning Rate and Batch Size In The Infinite\n Data Limit","summary":" One of the main challenges in optimal scaling of large language models (LLMs)\nis the prohibitive cost of hyperparameter tuning, particularly learning rate\n$\\eta$ and batch size $B$. While techniques like $\\mu$P (Yang et al., 2022)\nprovide scaling rules for optimal $\\eta$ transfer in the infinite model size\nlimit, the optimal scaling behavior in the infinite data size limit remains\nunknown. We fill in this gap by observing for the first time an intricate\ndependence of optimal $\\eta$ scaling on the pretraining token budget $T$, $B$\nand its relation to the critical batch size $B_\\mathrm{crit}$, which we measure\nto evolve as $B_\\mathrm{crit} \\propto T$. 
Furthermore, we show that the optimal\nbatch size is positively correlated with $B_\\mathrm{crit}$: keeping it fixed\nbecomes suboptimal over time even if learning rate is scaled optimally.\nSurprisingly, our results demonstrate that the observed optimal $\\eta$ and $B$\ndynamics are preserved with $\\mu$P model scaling, challenging the conventional\nview of $B_\\mathrm{crit}$ dependence solely on loss value. Complementing\noptimality, we examine the sensitivity of loss to changes in learning rate,\nwhere we find the sensitivity to decrease with increase of $T$ and to remain\nconstant with $\\mu$P model scaling. We hope our results make the first step\ntowards a unified picture of the joint optimal data and model scaling.\n","authors":["Oleg Filatov","Jan Ebert","Jiangtao Wang","Stefan Kesselheim"],"pdf_url":"https://arxiv.org/pdf/2410.05838v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05248v1","updated":"2025-01-09T14:00:01Z","published":"2025-01-09T14:00:01Z","title":"Deriving Coding-Specific Sub-Models from LLMs using Resource-Efficient\n Pruning","summary":" Large Language Models (LLMs) have demonstrated their exceptional performance\nin various complex code generation tasks. However, their broader adoption is\nlimited by significant computational demands and high resource requirements,\nparticularly memory and processing power. To mitigate such requirements, model\npruning techniques are used to create more compact models with significantly\nfewer parameters. However, current approaches do not focus on the efficient\nextraction of programming-language-specific sub-models. In this work, we\nexplore the idea of efficiently deriving coding-specific sub-models through\nunstructured pruning (i.e., Wanda). We investigate the impact of different\ndomain-specific calibration datasets on pruning outcomes across three distinct\ndomains and extend our analysis to extracting four language-specific\nsub-models: Python, Java, C++, and JavaScript. 
We are the first to efficiently\nextract programming-language-specific sub-models using appropriate calibration\ndatasets while maintaining acceptable accuracy w.r.t. full models. We are also\nthe first to provide analytical evidence that domain-specific tasks activate\ndistinct regions within LLMs, supporting the creation of specialized sub-models\nthrough unstructured pruning. We believe that this work has significant\npotential to enhance LLM accessibility for coding by reducing computational\nrequirements to enable local execution on consumer-grade hardware, and\nsupporting faster inference times critical for real-time development feedback.\n","authors":["Laura Puccioni","Alireza Farshin","Mariano Scazzariello","Changjie Wang","Marco Chiesa","Dejan Kostic"],"pdf_url":"https://arxiv.org/pdf/2501.05248v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05234v1","updated":"2025-01-09T13:41:37Z","published":"2025-01-09T13:41:37Z","title":"Optimizing Estonian TV Subtitles with Semi-supervised Learning and LLMs","summary":" This paper presents an approach for generating high-quality, same-language\nsubtitles for Estonian TV content. We fine-tune the Whisper model on\nhuman-generated Estonian subtitles and enhance it with iterative\npseudo-labeling and large language model (LLM) based post-editing. Our\nexperiments demonstrate notable subtitle quality improvement through\npseudo-labeling with an unlabeled dataset. We find that applying LLM-based\nediting at test time enhances subtitle accuracy, while its use during training\ndoes not yield further gains. 
This approach holds promise for creating subtitle\nquality close to the human standard and could be extended to real-time\napplications.\n","authors":["Artem Fedorchenko","Tanel Alumäe"],"pdf_url":"https://arxiv.org/pdf/2501.05234v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23569v4","updated":"2025-01-09T13:30:25Z","published":"2024-10-31T02:25:43Z","title":"RA-PbRL: Provably Efficient Risk-Aware Preference-Based Reinforcement\n Learning","summary":" Reinforcement Learning from Human Feedback (RLHF) has recently surged in\npopularity, particularly for aligning large language models and other AI\nsystems with human intentions. At its core, RLHF can be viewed as a specialized\ninstance of Preference-based Reinforcement Learning (PbRL), where the\npreferences specifically originate from human judgments rather than arbitrary\nevaluators. Despite this connection, most existing approaches in both RLHF and\nPbRL primarily focus on optimizing a mean reward objective, neglecting\nscenarios that necessitate risk-awareness, such as AI safety, healthcare, and\nautonomous driving. These scenarios often operate under a one-episode-reward\nsetting, which makes conventional risk-sensitive objectives inapplicable. To\naddress this, we explore and prove the applicability of two risk-aware\nobjectives to PbRL: nested and static quantile risk objectives. We also\nintroduce Risk-Aware PbRL (RA-PbRL), an algorithm designed to optimize both\nnested and static objectives. Additionally, we provide a theoretical analysis\nof the regret upper bounds, demonstrating that they are sublinear with respect\nto the number of episodes, and present empirical results to support our\nfindings. 
Our code is available in\nhttps://github.com/aguilarjose11/PbRLNeurips.\n","authors":["Yujie Zhao","Jose Efraim Aguilar Escamill","Weyl Lu","Huazheng Wang"],"pdf_url":"https://arxiv.org/pdf/2410.23569v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05226v1","updated":"2025-01-09T13:29:54Z","published":"2025-01-09T13:29:54Z","title":"Light Transport-aware Diffusion Posterior Sampling for Single-View\n Reconstruction of 3D Volumes","summary":" We introduce a single-view reconstruction technique of volumetric fields in\nwhich multiple light scattering effects are omnipresent, such as in clouds. We\nmodel the unknown distribution of volumetric fields using an unconditional\ndiffusion model trained on a novel benchmark dataset comprising 1,000\nsynthetically simulated volumetric density fields. The neural diffusion model\nis trained on the latent codes of a novel, diffusion-friendly, monoplanar\nrepresentation. The generative model is used to incorporate a tailored\nparametric diffusion posterior sampling technique into different reconstruction\ntasks. A physically-based differentiable volume renderer is employed to provide\ngradients with respect to light transport in the latent space. This stands in\ncontrast to classic NeRF approaches and makes the reconstructions better\naligned with observed data. Through various experiments, we demonstrate\nsingle-view reconstruction of volumetric clouds at a previously unattainable\nquality.\n","authors":["Ludwic Leonard","Nils Thuerey","Ruediger Westermann"],"pdf_url":"https://arxiv.org/pdf/2501.05226v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15879v2","updated":"2025-01-09T13:27:29Z","published":"2024-07-20T10:45:06Z","title":"Decentralized Federated Anomaly Detection in Smart Grids: A P2P Gossip\n Approach","summary":" The increasing security and privacy concerns in the Smart Grid sector have\nled to a significant demand for robust intrusion detection systems within\ncritical smart grid infrastructure. 
To address the challenges posed by privacy\npreservation and decentralized power system zones with distinct data ownership,\nFederated Learning (FL) has emerged as a promising privacy-preserving solution\nwhich facilitates collaborative training of attack detection models without\nnecessitating the sharing of raw data. However, FL presents several\nimplementation limitations in the power system domain due to its heavy reliance\non a centralized aggregator and the risks of privacy leakage during model\nupdate transmission. To overcome these technical bottlenecks, this paper\nintroduces a novel decentralized federated anomaly detection scheme based on\ntwo main gossip protocols namely Random Walk and Epidemic. Our findings\nindicate that the Random Walk protocol exhibits superior performance compared\nto the Epidemic protocol, highlighting its efficacy in decentralized federated\nlearning environments. Experimental validation of the proposed framework\nutilizing publicly available industrial control systems datasets demonstrates\nsuperior attack detection accuracy while safeguarding data confidentiality and\nmitigating the impact of communication latency and stragglers. Furthermore, our\napproach yields a notable 35% improvement in training time compared to\nconventional FL, underscoring the efficacy and robustness of our decentralized\nlearning method.\n","authors":["Muhammad Akbar Husnoo","Adnan Anwar","Md Enamul Haque","A. N. Mahmood"],"pdf_url":"https://arxiv.org/pdf/2407.15879v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05223v1","updated":"2025-01-09T13:19:59Z","published":"2025-01-09T13:19:59Z","title":"EVA-S2PLoR: A Secure Element-wise Multiplication Meets Logistic\n Regression on Heterogeneous Database","summary":" Accurate nonlinear computation is a key challenge in privacy-preserving\nmachine learning (PPML). Most existing frameworks approximate it through linear\noperations, resulting in significant precision loss. 
This paper proposes an\nefficient, verifiable and accurate secure 2-party logistic regression\nframework (EVA-S2PLoR), which achieves accurate nonlinear function computation\nthrough a novel secure element-wise multiplication protocol and its derived\nprotocols. Our framework primarily includes secure 2-party vector element-wise\nmultiplication, addition-to-multiplication, reciprocal, and sigmoid function\nprotocols based on data disguising technology, where high efficiency and accuracy are\nguaranteed by the simple computation flow based on the real number domain and\nthe small number of fixed communication rounds. We provide secure and robust\nanomaly detection through dimension transformation and Monte Carlo methods.\nEVA-S2PLoR outperforms many advanced frameworks in terms of precision\n(improving the performance of the sigmoid function by about 10 orders of\nmagnitude compared to most frameworks) and delivers the best overall\nperformance in secure logistic regression experiments.\n","authors":["Tianle Tao","Shizhao Peng","Tianyu Mei","Shoumo Li","Haogang Zhu"],"pdf_url":"https://arxiv.org/pdf/2501.05223v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14738v5","updated":"2025-01-09T13:06:40Z","published":"2024-12-19T11:10:48Z","title":"Boosting Graph Neural Network Training by Focusing on Non-Robust Samples\n from the Training Set","summary":" Graph Neural Networks (GNNs) are a highly effective neural network\narchitecture for processing graph-structured data. Unlike traditional neural\nnetworks that rely solely on the features of the data as input, GNNs leverage\nboth the graph structure, which represents the relationships between data\npoints, and the feature matrix of the data to optimize their feature\nrepresentation. This unique capability enables GNNs to achieve superior\nperformance across various tasks. 
However, it also makes GNNs more susceptible\nto noise from both the graph structure and data features, which can\nsignificantly increase the training difficulty and degrade their performance.\nTo address this issue, this paper proposes a novel method for selecting\nnoise-sensitive training samples from the original training set to construct a\nsmaller yet more effective training set for model training. These samples are\nthen used to enhance the model's ability to handle noise-prone instances\neffectively. We have evaluated our approach on three of the most classical GNN\nmodels -- GCN, GAT, and GraphSAGE -- as well as three widely used benchmark\ndatasets: Cora, Citeseer, and PubMed. Our experiments demonstrate that the\nproposed method can substantially boost the overall training of Graph Neural\nNetworks compared to using randomly constructed training sets.\n","authors":["Yongyu Wang"],"pdf_url":"https://arxiv.org/pdf/2412.14738v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05207v1","updated":"2025-01-09T12:57:41Z","published":"2025-01-09T12:57:41Z","title":"CoDe: Communication Delay-Tolerant Multi-Agent Collaboration via Dual\n Alignment of Intent and Timeliness","summary":" Communication has been widely employed to enhance multi-agent collaboration.\nPrevious research has typically assumed delay-free communication, a strong\nassumption that is challenging to meet in practice. However, real-world agents\nsuffer from channel delays, receiving messages sent at different time points,\ntermed {\\it{Asynchronous Communication}}, leading to cognitive biases and\nbreakdowns in collaboration. This paper first defines two communication delay\nsettings in MARL and emphasizes their harm to collaboration. To handle the\nabove delays, this paper proposes a novel framework, Communication\nDelay-tolerant Multi-Agent Collaboration (CoDe). 
At first, CoDe learns an\nintent representation as messages through future action inference, reflecting\nthe stable future behavioral trends of the agents. Then, CoDe devises a dual\nalignment mechanism of intent and timeliness to strengthen the fusion process\nof asynchronous messages. In this way, agents can extract the long-term intent\nof others, even from delayed messages, and selectively utilize the most recent\nmessages that are relevant to their intent. Experimental results demonstrate\nthat CoDe outperforms baseline algorithms in three MARL benchmarks without\ndelay and exhibits robustness under fixed and time-varying delays.\n","authors":["Shoucheng Song","Youfang Lin","Sheng Han","Chang Yao","Hao Wu","Shuo Wang","Kai Lv"],"pdf_url":"https://arxiv.org/pdf/2501.05207v1.pdf","comment":"AAAI 2025 Accepted"},{"id":"http://arxiv.org/abs/2501.05204v1","updated":"2025-01-09T12:55:21Z","published":"2025-01-09T12:55:21Z","title":"Design and Control of a Bipedal Robotic Character","summary":" Legged robots have achieved impressive feats in dynamic locomotion in\nchallenging unstructured terrain. However, in entertainment applications, the\ndesign and control of these robots face additional challenges in appealing to\nhuman audiences. This work aims to unify expressive, artist-directed motions\nand robust dynamic mobility for legged robots. To this end, we introduce a new\nbipedal robot, designed with a focus on character-driven mechanical features.\nWe present a reinforcement learning-based control architecture to robustly\nexecute artistic motions conditioned on command signals. During runtime, these\ncommand signals are generated by an animation engine which composes and blends\nbetween multiple animation sources. Finally, an intuitive operator interface\nenables real-time show performances with the robot. 
The complete system results\nin a believable robotic character, and paves the way for enhanced human-robot\nengagement in various contexts, in entertainment robotics and beyond.\n","authors":["Ruben Grandia","Espen Knoop","Michael A. Hopkins","Georg Wiedebach","Jared Bishop","Steven Pickles","David Müller","Moritz Bächer"],"pdf_url":"https://arxiv.org/pdf/2501.05204v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05197v1","updated":"2025-01-09T12:48:15Z","published":"2025-01-09T12:48:15Z","title":"An Algorithmic Approach for Causal Health Equity: A Look at Race\n Differentials in Intensive Care Unit (ICU) Outcomes","summary":" The new era of large-scale data collection and analysis presents an\nopportunity for diagnosing and understanding the causes of health inequities.\nIn this study, we describe a framework for systematically analyzing health\ndisparities using causal inference. The framework is illustrated by\ninvestigating racial and ethnic disparities in intensive care unit (ICU)\noutcome between majority and minority groups in Australia (Indigenous vs.\nNon-Indigenous) and the United States (African-American vs. White). We\ndemonstrate that commonly used statistical measures for quantifying inequity\nare insufficient, and focus on attributing the observed disparity to the causal\nmechanisms that generate it. We find that minority patients are younger at\nadmission, have worse chronic health, are more likely to be admitted for urgent\nand non-elective reasons, and have higher illness severity. At the same time,\nhowever, we find a protective direct effect of belonging to a minority group,\nwith minority patients showing improved survival compared to their majority\ncounterparts, with all other variables kept equal. We demonstrate that this\nprotective effect is related to the increased probability of being admitted to\nICU, with minority patients having an increased risk of ICU admission. 
We also\nfind that minority patients, while showing improved survival, are more likely\nto be readmitted to ICU. Thus, due to worse access to primary health care,\nminority patients are more likely to end up in ICU for preventable conditions,\ncausing a reduction in the mortality rates and creating an effect that appears\nto be protective. Since the baseline risk of ICU admission may serve as proxy\nfor lack of access to primary care, we developed the Indigenous Intensive Care\nEquity (IICE) Radar, a monitoring system for tracking the over-utilization of\nICU resources by the Indigenous population of Australia across geographical\nareas.\n","authors":["Drago Plecko","Paul Secombe","Andrea Clarke","Amelia Fiske","Samarra Toby","Donisha Duff","David Pilcher","Leo Anthony Celi","Rinaldo Bellomo","Elias Bareinboim"],"pdf_url":"https://arxiv.org/pdf/2501.05197v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.04729v3","updated":"2025-01-09T12:44:44Z","published":"2024-01-09T18:59:47Z","title":"Human Delegation Behavior in Human-AI Collaboration: The Effect of\n Contextual Information","summary":" The integration of artificial intelligence (AI) into human decision-making\nprocesses at the workplace presents both opportunities and challenges. One\npromising approach to leverage existing complementary capabilities is allowing\nhumans to delegate individual instances of decision tasks to AI. However,\nenabling humans to delegate instances effectively requires them to assess\nseveral factors. One key factor is the analysis of both their own capabilities\nand those of the AI in the context of the given task. In this work, we conduct\na behavioral study to explore the effects of providing contextual information\nto support this delegation decision. Specifically, we investigate how\ncontextual information about the AI and the task domain influence humans'\ndelegation decisions to an AI and their impact on the human-AI team\nperformance. 
Our findings reveal that access to contextual information\nsignificantly improves human-AI team performance in delegation settings.\nFinally, we show that the delegation behavior changes with the different types\nof contextual information. Overall, this research advances the understanding of\ncomputer-supported, collaborative work and provides actionable insights for\ndesigning more effective collaborative systems.\n","authors":["Philipp Spitzer","Joshua Holstein","Patrick Hemmer","Michael Vössing","Niklas Kühl","Dominik Martin","Gerhard Satzger"],"pdf_url":"https://arxiv.org/pdf/2401.04729v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.09094v2","updated":"2025-01-09T12:38:37Z","published":"2024-12-12T09:22:04Z","title":"Filter-then-Generate: Large Language Models with Structure-Text Adapter\n for Knowledge Graph Completion","summary":" Large Language Models (LLMs) present massive inherent knowledge and superior\nsemantic comprehension capability, which have revolutionized various tasks in\nnatural language processing. Despite their success, a critical gap remains in\nenabling LLMs to perform knowledge graph completion (KGC). Empirical evidence\nsuggests that LLMs consistently perform worse than conventional KGC approaches,\neven through sophisticated prompt design or tailored instruction-tuning.\nFundamentally, applying LLMs on KGC introduces several critical challenges,\nincluding a vast set of entity candidates, hallucination issue of LLMs, and\nunder-exploitation of the graph structure. To address these challenges, we\npropose a novel instruction-tuning-based method, namely FtG. Specifically, we\npresent a \\textit{filter-then-generate} paradigm and formulate the KGC task\ninto a multiple-choice question format. 
In this way, we can harness the\ncapability of LLMs while mitigating the issue caused by hallucinations.\nMoreover, we devise a flexible ego-graph serialization prompt and employ a\nstructure-text adapter to couple structure and text information in a\ncontextualized manner. Experimental results demonstrate that FtG achieves\nsubstantial performance gains compared to existing state-of-the-art methods. The\ninstruction dataset and code are available at\n\url{https://github.com/LB0828/FtG}.\n","authors":["Ben Liu","Jihai Zhang","Fangquan Lin","Cheng Yang","Min Peng"],"pdf_url":"https://arxiv.org/pdf/2412.09094v2.pdf","comment":"COLING 2025 Main Conference"},{"id":"http://arxiv.org/abs/2501.05190v1","updated":"2025-01-09T12:30:22Z","published":"2025-01-09T12:30:22Z","title":"RadioTransformer: Accurate Radio Map Construction and Coverage\n Prediction","summary":" Radio map, or pathloss map prediction, is a crucial method for wireless\nnetwork modeling and management. By leveraging deep learning to construct\npathloss patterns from geographical maps, an accurate digital replica of the\ntransmission environment could be established with less computational overhead\nand lower prediction error compared to traditional model-driven techniques.\nWhile existing state-of-the-art (SOTA) methods predominantly rely on\nconvolutional architectures, this paper introduces a hybrid\ntransformer-convolution model, termed RadioTransformer, to enhance the accuracy\nof radio map prediction. 
The proposed model features a multi-scale\ntransformer-based encoder for efficient feature extraction and a\nconvolution-based decoder for precise pixel-level image reconstruction.\nSimulation results demonstrate that the proposed scheme significantly improves\nprediction accuracy, and over a 30% reduction in root mean square error (RMSE)\nis achieved compared to typical SOTA approaches.\n","authors":["Yuxuan Li","Cheng Zhang","Wen Wang","Yongming Huang"],"pdf_url":"https://arxiv.org/pdf/2501.05190v1.pdf","comment":"Submitted to IEEE VTC 2025 Spring"},{"id":"http://arxiv.org/abs/2411.17251v5","updated":"2025-01-09T12:28:55Z","published":"2024-11-26T09:29:27Z","title":"DGNN-YOLO: Interpretable Dynamic Graph Neural Networks with YOLO11 for\n Detecting and Tracking Small Occluded Objects in Urban Traffic","summary":" The detection and tracking of small, occluded objects such as pedestrians,\ncyclists, and motorbikes pose significant challenges for traffic surveillance\nsystems because of their erratic movement, frequent occlusion, and poor\nvisibility in dynamic urban environments. Traditional methods like YOLO11,\nwhile proficient in spatial feature extraction for precise detection, often\nstruggle with these small and dynamically moving objects, particularly in\nhandling real-time data updates and resource efficiency. This paper introduces\nDGNN-YOLO, a novel framework that integrates dynamic graph neural networks\n(DGNNs) with YOLO11 to address these limitations. Unlike standard GNNs, DGNNs\nare chosen for their superior ability to dynamically update graph structures in\nreal-time, which enables adaptive and robust tracking of objects in highly\nvariable urban traffic scenarios. This framework constructs and regularly\nupdates its graph representations, capturing objects as nodes and their\ninteractions as edges, thus effectively responding to rapidly changing\nconditions. 
Additionally, DGNN-YOLO incorporates Grad-CAM, Grad-CAM++, and\nEigen-CAM visualization techniques to enhance interpretability and foster\ntrust, offering insights into the model's decision-making process. Extensive\nexperiments validate the framework's performance, achieving a precision of\n0.8382, recall of 0.6875, and mAP@0.5:0.95 of 0.6476, significantly\noutperforming existing methods. This study offers a scalable and interpretable\nsolution for real-time traffic surveillance and significantly advances\nintelligent transportation systems' capabilities by addressing the critical\nchallenge of detecting and tracking small, occluded objects.\n","authors":["Shahriar Soudeep","M. F. Mridha","Md Abrar Jahin","Nilanjan Dey"],"pdf_url":"https://arxiv.org/pdf/2411.17251v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15361v2","updated":"2025-01-09T12:18:26Z","published":"2024-12-19T19:47:35Z","title":"Spatiotemporally Coherent Probabilistic Generation of Weather from\n Climate","summary":" Local climate information is crucial for impact assessment and\ndecision-making, yet coarse global climate simulations cannot capture\nsmall-scale phenomena. Current statistical downscaling methods infer these\nphenomena as temporally decoupled spatial patches. However, to preserve\nphysical properties, estimating spatio-temporally coherent high-resolution\nweather dynamics for multiple variables across long time horizons is crucial.\nWe present a novel generative approach that uses a score-based diffusion model\ntrained on high-resolution reanalysis data to capture the statistical\nproperties of local weather dynamics. After training, we condition on coarse\nclimate model data to generate weather patterns consistent with the aggregate\ninformation. As this inference task is inherently uncertain, we leverage the\nprobabilistic nature of diffusion models and sample multiple trajectories. 
We\nevaluate our approach with high-resolution reanalysis information before\napplying it to the climate model downscaling task. We then demonstrate that the\nmodel generates spatially and temporally coherent weather dynamics that align\nwith global climate output.\n","authors":["Jonathan Schmidt","Luca Schmidt","Felix Strnad","Nicole Ludwig","Philipp Hennig"],"pdf_url":"https://arxiv.org/pdf/2412.15361v2.pdf","comment":"15 pages, 6 figures, additional supplementary text and figures"},{"id":"http://arxiv.org/abs/2406.11814v5","updated":"2025-01-09T12:14:23Z","published":"2024-06-17T17:54:42Z","title":"Stochastic Neural Network Symmetrisation in Markov Categories","summary":" We consider the problem of symmetrising a neural network along a group\nhomomorphism: given a homomorphism $\\varphi : H \\to G$, we would like a\nprocedure that converts $H$-equivariant neural networks to $G$-equivariant\nones. We formulate this in terms of Markov categories, which allows us to\nconsider neural networks whose outputs may be stochastic, but with\nmeasure-theoretic details abstracted away. We obtain a flexible and\ncompositional framework for symmetrisation that relies on minimal assumptions\nabout the structure of the group and the underlying neural network\narchitecture. Our approach recovers existing canonicalisation and averaging\ntechniques for symmetrising deterministic models, and extends to provide a\nnovel methodology for symmetrising stochastic models also. 
Beyond this, our\nfindings also demonstrate the utility of Markov categories for addressing\ncomplex problems in machine learning in a conceptually clear yet mathematically\nprecise way.\n","authors":["Rob Cornish"],"pdf_url":"https://arxiv.org/pdf/2406.11814v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16768v2","updated":"2025-01-09T12:04:51Z","published":"2024-09-25T09:26:19Z","title":"Interpreting Deep Neural Network-Based Receiver Under Varying\n Signal-To-Noise Ratios","summary":" We propose a novel method for interpreting neural networks, focusing on\nconvolutional neural network-based receiver model. The method identifies which\nunit or units of the model contain most (or least) information about the\nchannel parameter(s) of the interest, providing insights at both global and\nlocal levels -- with global explanations aggregating local ones. Experiments on\nlink-level simulations demonstrate the method's effectiveness in identifying\nunits that contribute most (and least) to signal-to-noise ratio processing.\nAlthough we focus on a radio receiver model, the method generalizes to other\nneural network architectures and applications, offering robust estimation even\nin high-dimensional settings.\n","authors":["Marko Tuononen","Dani Korpi","Ville Hautamäki"],"pdf_url":"https://arxiv.org/pdf/2409.16768v2.pdf","comment":"7+1 pages, 8 figures, 1 equation"},{"id":"http://arxiv.org/abs/2501.05170v1","updated":"2025-01-09T11:44:49Z","published":"2025-01-09T11:44:49Z","title":"De-centering the (Traditional) User: Multistakeholder Evaluation of\n Recommender Systems","summary":" Multistakeholder recommender systems are those that account for the impacts\nand preferences of multiple groups of individuals, not just the end users\nreceiving recommendations. Due to their complexity, evaluating these systems\ncannot be restricted to the overall utility of a single stakeholder, as is\noften the case of more mainstream recommender system applications. 
In this\narticle, we focus our discussion on the intricacies of the evaluation of\nmultistakeholder recommender systems. We bring attention to the different\naspects involved in the evaluation of multistakeholder recommender systems -\nfrom the range of stakeholders involved (including but not limited to producers\nand consumers) to the values and specific goals of each relevant stakeholder.\nAdditionally, we discuss how to move from theoretical principles to practical\nimplementation, providing specific use case examples. Finally, we outline open\nresearch directions for the RecSys community to explore. We aim to provide\nguidance to researchers and practitioners about how to think about these\ncomplex and domain-dependent issues of evaluation in the course of designing,\ndeveloping, and researching applications with multistakeholder aspects.\n","authors":["Robin Burke","Gediminas Adomavicius","Toine Bogers","Tommaso Di Noia","Dominik Kowald","Julia Neidhardt","Özlem Özgöbek","Maria Soledad Pera","Nava Tintarev","Jürgen Ziegler"],"pdf_url":"https://arxiv.org/pdf/2501.05170v1.pdf","comment":"Preprint submitted to Elsevier, \"Re-centering the User in Recommender\n System Research\" special issue of the International Journal of Human-Computer\n Studies (IJHCS)"},{"id":"http://arxiv.org/abs/2404.16969v4","updated":"2025-01-09T11:42:21Z","published":"2024-04-25T18:42:25Z","title":"COCOLA: Coherence-Oriented Contrastive Learning of Musical Audio\n Representations","summary":" We present COCOLA (Coherence-Oriented Contrastive Learning for Audio), a\ncontrastive learning method for musical audio representations that captures the\nharmonic and rhythmic coherence between samples. Our method operates at the\nlevel of the stems composing music tracks and can input features obtained via\nHarmonic-Percussive Separation (HPS). 
COCOLA allows the objective evaluation of\ngenerative models for music accompaniment generation, which are difficult to\nbenchmark with established metrics. In this regard, we evaluate recent music\naccompaniment generation models, demonstrating the effectiveness of the\nproposed method. We release the model checkpoints trained on public datasets\ncontaining separate stems (MUSDB18-HQ, MoisesDB, Slakh2100, and CocoChorales).\n","authors":["Ruben Ciranni","Giorgio Mariani","Michele Mancusi","Emilian Postolache","Giorgio Fabbro","Emanuele Rodolà","Luca Cosmo"],"pdf_url":"https://arxiv.org/pdf/2404.16969v4.pdf","comment":"Demo page: https://github.com/gladia-research-group/cocola, Accepted\n at ICASSP-25"},{"id":"http://arxiv.org/abs/2412.11120v2","updated":"2025-01-09T11:39:32Z","published":"2024-12-15T08:51:14Z","title":"Latent Reward: LLM-Empowered Credit Assignment in Episodic Reinforcement\n Learning","summary":" Reinforcement learning (RL) often encounters delayed and sparse feedback in\nreal-world applications, even with only episodic rewards. Previous approaches\nhave made some progress in reward redistribution for credit assignment but\nstill face challenges, including training difficulties due to redundancy and\nambiguous attributions stemming from overlooking the multifaceted nature of\nmission performance evaluation. Hopefully, Large Language Model (LLM)\nencompasses fruitful decision-making knowledge and provides a plausible tool\nfor reward redistribution. Even so, deploying LLM in this case is non-trivial\ndue to the misalignment between linguistic knowledge and the symbolic form\nrequirement, together with inherent randomness and hallucinations in inference.\nTo tackle these issues, we introduce LaRe, a novel LLM-empowered symbolic-based\ndecision-making framework, to improve credit assignment. 
Key to LaRe is the\nconcept of the Latent Reward, which works as a multi-dimensional performance\nevaluation, enabling more interpretable goal attainment from various\nperspectives and facilitating more effective reward redistribution. We examine\nthat semantically generated code from LLM can bridge linguistic knowledge and\nsymbolic latent rewards, as it is executable for symbolic objects. Meanwhile,\nwe design latent reward self-verification to increase the stability and\nreliability of LLM inference. Theoretically, reward-irrelevant redundancy\nelimination in the latent reward benefits RL performance from more accurate\nreward estimation. Extensive experimental results witness that LaRe (i)\nachieves superior temporal credit assignment to SOTA methods, (ii) excels in\nallocating contributions among multiple agents, and (iii) outperforms policies\ntrained with ground truth rewards for certain tasks.\n","authors":["Yun Qu","Yuhang Jiang","Boyuan Wang","Yixiu Mao","Cheems Wang","Chang Liu","Xiangyang Ji"],"pdf_url":"https://arxiv.org/pdf/2412.11120v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.15166v3","updated":"2025-01-09T11:39:19Z","published":"2024-02-23T07:59:23Z","title":"Convergence Analysis of Split Federated Learning on Heterogeneous Data","summary":" Split federated learning (SFL) is a recent distributed approach for\ncollaborative model training among multiple clients. In SFL, a global model is\ntypically split into two parts, where clients train one part in a parallel\nfederated manner, and a main server trains the other. Despite the recent\nresearch on SFL algorithm development, the convergence analysis of SFL is\nmissing in the literature, and this paper aims to fill this gap. The analysis\nof SFL can be more challenging than that of federated learning (FL), due to the\npotential dual-paced updates at the clients and the main server. 
We provide\nconvergence analysis of SFL for strongly convex and general convex objectives\non heterogeneous data. The convergence rates are $O(1/T)$ and\n$O(1/\\sqrt[3]{T})$, respectively, where $T$ denotes the total number of rounds\nfor SFL training. We further extend the analysis to non-convex objectives and\nthe scenario where some clients may be unavailable during training.\nExperiments validate our theoretical results and show that SFL\noutperforms FL and split learning (SL) when data is highly heterogeneous across\na large number of clients.\n","authors":["Pengchao Han","Chao Huang","Geng Tian","Ming Tang","Xin Liu"],"pdf_url":"https://arxiv.org/pdf/2402.15166v3.pdf","comment":"Accepted by Conference on Neural Information Processing Systems\n (NeurIPS 2024)"},{"id":"http://arxiv.org/abs/2501.04239v2","updated":"2025-01-09T11:38:45Z","published":"2025-01-08T02:32:48Z","title":"Dynamic Localisation of Spatial-Temporal Graph Neural Network","summary":" Spatial-temporal data, fundamental to many intelligent applications, reveals\ndependencies indicating causal links between present measurements at specific\nlocations and historical data at the same or other locations. Within this\ncontext, adaptive spatial-temporal graph neural networks (ASTGNNs) have emerged\nas valuable tools for modelling these dependencies, especially through a\ndata-driven approach rather than pre-defined spatial graphs. While this\napproach offers higher accuracy, it presents increased computational demands.\nAddressing this challenge, this paper delves into the concept of localisation\nwithin ASTGNNs, introducing an innovative perspective that spatial dependencies\nshould be dynamically evolving over time. We introduce \\textit{DynAGS}, a\nlocalised ASTGNN framework aimed at maximising efficiency and accuracy in\ndistributed deployment. 
This framework integrates dynamic localisation,\ntime-evolving spatial graphs, and personalised localisation, all orchestrated\naround the Dynamic Graph Generator, a light-weighted central module leveraging\ncross attention. The central module can integrate historical information in a\nnode-independent manner to enhance the feature representation of nodes at the\ncurrent moment. This improved feature representation is then used to generate a\ndynamic sparse graph without the need for costly data exchanges, and it\nsupports personalised localisation. Performance assessments across two core\nASTGNN architectures and nine real-world datasets from various applications\nreveal that \\textit{DynAGS} outshines current benchmarks, underscoring that the\ndynamic modelling of spatial dependencies can drastically improve model\nexpressibility, flexibility, and system efficiency, especially in distributed\nsettings.\n","authors":["Wenying Duan","Shujun Guo","Wei huang","Hong Rao","Xiaoxi He"],"pdf_url":"https://arxiv.org/pdf/2501.04239v2.pdf","comment":"This paper was accepted by KDD'25"},{"id":"http://arxiv.org/abs/2405.09062v6","updated":"2025-01-09T11:38:07Z","published":"2024-05-15T03:26:01Z","title":"Naturalistic Music Decoding from EEG Data via Latent Diffusion Models","summary":" In this article, we explore the potential of using latent diffusion models, a\nfamily of powerful generative models, for the task of reconstructing\nnaturalistic music from electroencephalogram (EEG) recordings. Unlike simpler\nmusic with limited timbres, such as MIDI-generated tunes or monophonic pieces,\nthe focus here is on intricate music featuring a diverse array of instruments,\nvoices, and effects, rich in harmonics and timbre. This study represents an\ninitial foray into achieving general music reconstruction of high-quality using\nnon-invasive EEG data, employing an end-to-end training approach directly on\nraw data without the need for manual pre-processing and channel selection. 
We\ntrain our models on the public NMED-T dataset and perform quantitative\nevaluation proposing neural embedding-based metrics. Our work contributes to\nthe ongoing research in neural decoding and brain-computer interfaces, offering\ninsights into the feasibility of using EEG data for complex auditory\ninformation reconstruction.\n","authors":["Emilian Postolache","Natalia Polouliakh","Hiroaki Kitano","Akima Connelly","Emanuele Rodolà","Luca Cosmo","Taketo Akama"],"pdf_url":"https://arxiv.org/pdf/2405.09062v6.pdf","comment":"Accepted at ICASSP-25"},{"id":"http://arxiv.org/abs/2404.03105v2","updated":"2025-01-09T11:24:56Z","published":"2024-04-03T23:07:24Z","title":"Methodology for Interpretable Reinforcement Learning for Optimizing\n Mechanical Ventilation","summary":" Mechanical ventilation is a critical life support intervention that delivers\ncontrolled air and oxygen to a patient's lungs, assisting or replacing\nspontaneous breathing. While several data-driven approaches have been proposed\nto optimize ventilator control strategies, they often lack interpretability and\nalignment with domain knowledge, hindering clinical adoption. This paper\npresents a methodology for interpretable reinforcement learning (RL) aimed at\nimproving mechanical ventilation control as part of connected health systems.\nUsing a causal, nonparametric model-based off-policy evaluation, we assess RL\npolicies for their ability to enhance patient-specific outcomes-specifically,\nincreasing blood oxygen levels (SpO2), while avoiding aggressive ventilator\nsettings that may cause ventilator-induced lung injuries and other\ncomplications. Through numerical experiments on real-world ICU data from the\nMIMIC-III database, we demonstrate that our interpretable decision tree policy\nachieves performance comparable to state-of-the-art deep RL methods while\noutperforming standard behavior cloning approaches. 
The results highlight the\npotential of interpretable, data-driven decision support systems to improve\nsafety and efficiency in personalized ventilation strategies, paving the way\nfor seamless integration into connected healthcare environments.\n","authors":["Joo Seung Lee","Malini Mahendra","Anil Aswani"],"pdf_url":"https://arxiv.org/pdf/2404.03105v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00717v3","updated":"2025-01-09T11:24:44Z","published":"2024-09-01T13:14:41Z","title":"Preference-Based Multi-Agent Reinforcement Learning: Data Coverage and\n Algorithmic Techniques","summary":" We initiate the study of Preference-Based Multi-Agent Reinforcement Learning\n(PbMARL), exploring both theoretical foundations and empirical validations. We\ndefine the task as identifying the Nash equilibrium from a preference-only\noffline dataset in general-sum games, a problem marked by the challenge of\nsparse feedback signals. Our theory establishes the upper complexity bounds for\nNash Equilibrium in effective PbMARL, demonstrating that single-policy coverage\nis inadequate and highlighting the importance of unilateral dataset coverage.\nThese theoretical insights are verified through comprehensive experiments. To\nenhance the practical performance, we further introduce two algorithmic\ntechniques. (1) We propose a Mean Squared Error (MSE) regularization along the\ntime axis to achieve a more uniform reward distribution and improve reward\nlearning outcomes. (2) We propose an additional penalty based on the\ndistribution of the dataset to incorporate pessimism, improving stability and\neffectiveness during training. Our findings underscore the multifaceted\napproach required for PbMARL, paving the way for effective preference-based\nmulti-agent systems.\n","authors":["Natalia Zhang","Xinqi Wang","Qiwen Cui","Runlong Zhou","Sham M. Kakade","Simon S. 
Du"],"pdf_url":"https://arxiv.org/pdf/2409.00717v3.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2501.02648v2","updated":"2025-01-09T11:17:01Z","published":"2025-01-05T20:26:49Z","title":"Representation Learning of Lab Values via Masked AutoEncoder","summary":" Accurate imputation of missing laboratory values in electronic health records\n(EHRs) is critical to enable robust clinical predictions and reduce biases in\nAI systems in healthcare. Existing methods, such as variational autoencoders\n(VAEs) and decision tree-based approaches such as XGBoost, struggle to model\nthe complex temporal and contextual dependencies in EHR data, mainly in\nunderrepresented groups. In this work, we propose Lab-MAE, a novel\ntransformer-based masked autoencoder framework that leverages self-supervised\nlearning for the imputation of continuous sequential lab values. Lab-MAE\nintroduces a structured encoding scheme that jointly models laboratory test\nvalues and their corresponding timestamps, enabling explicit capture of temporal\ndependencies. Empirical evaluation on the MIMIC-IV dataset demonstrates that\nLab-MAE significantly outperforms the state-of-the-art baselines such as\nXGBoost across multiple metrics, including root mean square error (RMSE),\nR-squared (R2), and Wasserstein distance (WD). Notably, Lab-MAE achieves\nequitable performance across demographic groups of patients, advancing fairness\nin clinical predictions. We further investigate the role of follow-up\nlaboratory values as potential shortcut features, revealing Lab-MAE's\nrobustness in scenarios where such data is unavailable. The findings suggest\nthat our transformer-based architecture, adapted to the characteristics of the\nEHR data, offers a foundation model for more accurate and fair clinical\nimputation models. 
In addition, we measure and compare the carbon footprint of\nLab-MAE with the baseline XGBoost model, highlighting its environmental\nrequirements.\n","authors":["David Restrepo","Chenwei Wu","Yueran Jia","Jaden K. Sun","Jack Gallifant","Catherine G. Bielick","Yugang Jia","Leo A. Celi"],"pdf_url":"https://arxiv.org/pdf/2501.02648v2.pdf","comment":"10 pages main text, 8 appendix"},{"id":"http://arxiv.org/abs/2411.07066v2","updated":"2025-01-09T11:11:37Z","published":"2024-11-11T15:30:16Z","title":"Zeroth-Order Adaptive Neuron Alignment Based Pruning without Re-Training","summary":" Network pruning focuses on computational techniques that aim to reduce a\ngiven model's computational cost by removing a subset of its parameters while\nhaving minimal impact on performance. Throughout the last decade, the most\nwidely used pruning paradigm has been pruning and re-training, which nowadays\nis inconvenient due to the vast amount of pre-trained models, which are in any\ncase too expensive to re-train. In this paper, we exploit functional\ninformation from dense pre-trained models, i.e., their activations, to obtain\nsparse models that maximize the activations' alignment w.r.t. their\ncorresponding dense models. Hence, we propose \\textsc{NeuroAL}, a \\emph{top-up}\nalgorithm that can be used on top of any given pruning algorithm for LLMs,\nwhich modifies the block-wise and row-wise sparsity exploiting information from\nboth the dense model and its sparse version to maximize the \\emph{neuron\nalignment} among activations. Differently from existing methods, our approach\nadaptively selects the best hyperparameters for the block-wise and row-wise\nsparsity ratios w.r.t. the model and the desired sparsity, and requires\n\\emph{no re-training}. 
We test our method over 276 cases combining four LLM\nfamilies, three sparsity ratios, and ten language tasks (three language\nmodeling and seven zero-shot datasets), showing how it consistently outperforms\nthe latest state-of-the-art methods in terms of performance-runtime trade-off.\nThe code is available at\n\\href{https://github.com/eliacunegatti/NeuroAL}{https://github.com/eliacunegatti/NeuroAL}.\n","authors":["Elia Cunegatti","Leonardo Lucio Custode","Giovanni Iacca"],"pdf_url":"https://arxiv.org/pdf/2411.07066v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2409.05072v2","updated":"2025-01-09T11:06:36Z","published":"2024-09-08T12:19:12Z","title":"A General Framework for Clustering and Distribution Matching with Bandit\n Feedback","summary":" We develop a general framework for clustering and distribution matching\nproblems with bandit feedback. We consider a $K$-armed bandit model where some\nsubset of $K$ arms is partitioned into $M$ groups. Within each group, the\nrandom variable associated to each arm follows the same distribution on a\nfinite alphabet. At each time step, the decision maker pulls an arm and\nobserves its outcome from the random variable associated to that arm.\nSubsequent arm pulls depend on the history of arm pulls and their outcomes. The\ndecision maker has no knowledge of the distributions of the arms or the\nunderlying partitions. The task is to devise an online algorithm to learn the\nunderlying partition of arms with the least number of arm pulls on average and\nwith an error probability not exceeding a pre-determined value~$\\delta$.\nSeveral existing problems fall under our general framework, including finding\n$M$ pairs of arms, odd arm identification, and $N$-ary clustering of $K$ arms\nbelong to our general framework. We derive a non-asymptotic lower bound on the\naverage number of arm pulls for any online algorithm with an error probability\nnot exceeding $\\delta$. 
Furthermore, we develop a computationally-efficient\nonline algorithm based on the Track-and-Stop method and Frank--Wolfe algorithm,\nand show that the average number of arm pulls of our algorithm asymptotically\nmatches that of the lower bound. Our refined analysis also uncovers a novel\nbound on the speed at which the average number of arm pulls of our algorithm\nconverges to the fundamental limit as $\\delta$ vanishes.\n","authors":["Recep Can Yavas","Yuqi Huang","Vincent Y. F. Tan","Jonathan Scarlett"],"pdf_url":"https://arxiv.org/pdf/2409.05072v2.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2406.00778v2","updated":"2025-01-09T10:47:35Z","published":"2024-06-02T15:35:45Z","title":"Bayesian Joint Additive Factor Models for Multiview Learning","summary":" It is increasingly common in a wide variety of applied settings to collect\ndata of multiple different types on the same set of samples. Our particular\nfocus in this article is on studying relationships between such multiview\nfeatures and responses. A motivating application arises in the context of\nprecision medicine where multi-omics data are collected to correlate with\nclinical outcomes. It is of interest to infer dependence within and across\nviews while combining multimodal information to improve the prediction of\noutcomes. The signal-to-noise ratio can vary substantially across views,\nmotivating more nuanced statistical tools beyond standard late and early\nfusion. This challenge comes with the need to preserve interpretability, select\nfeatures, and obtain accurate uncertainty quantification. We propose a joint\nadditive factor regression model (JAFAR) with a structured additive design,\naccounting for shared and view-specific components. We ensure identifiability\nvia a novel dependent cumulative shrinkage process (D-CUSP) prior. We provide\nan efficient implementation via a partially collapsed Gibbs sampler and extend\nour approach to allow flexible feature and outcome distributions. 
Prediction of\ntime-to-labor onset from immunome, metabolome, and proteome data illustrates\nperformance gains against state-of-the-art competitors. Our open-source\nsoftware (R package) is available at https://github.com/niccoloanceschi/jafar.\n","authors":["Niccolo Anceschi","Federico Ferrari","David B. Dunson","Himel Mallick"],"pdf_url":"https://arxiv.org/pdf/2406.00778v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05130v1","updated":"2025-01-09T10:33:16Z","published":"2025-01-09T10:33:16Z","title":"Learning In-Distribution Representations for Anomaly Detection","summary":" Anomaly detection involves identifying data patterns that deviate from the\nanticipated norm. Traditional methods struggle in high-dimensional spaces due\nto the curse of dimensionality. In recent years, self-supervised learning,\nparticularly through contrastive objectives, has driven advances in anomaly\ndetection. However, vanilla contrastive learning struggles to align with the\nunique demands of anomaly detection, as it lacks a pretext task tailored to the\nhomogeneous nature of In-Distribution (ID) data and the diversity of\nOut-of-Distribution (OOD) anomalies. Methods that attempt to address these\nchallenges, such as introducing hard negatives through synthetic outliers,\nOutlier Exposure (OE), and supervised objectives, often rely on pretext tasks\nthat fail to balance compact clustering of ID samples with sufficient\nseparation from OOD data. In this work, we propose Focused In-distribution\nRepresentation Modeling (FIRM), a contrastive learning objective specifically\ndesigned for anomaly detection. Unlike existing approaches, FIRM incorporates\nsynthetic outliers into its pretext task in a way that actively shapes the\nrepresentation space, promoting compact clustering of ID samples while\nenforcing strong separation from outliers. 
This formulation addresses the\nchallenges of class collision, enhancing both the compactness of ID\nrepresentations and the discriminative power of the learned feature space. We\nshow that FIRM surpasses other contrastive methods in standard benchmarks,\nsignificantly enhancing anomaly detection compared to both traditional and\nsupervised contrastive learning objectives. Our ablation studies confirm that\nFIRM consistently improves the quality of representations and shows robustness\nacross a range of scoring methods. The code is available at:\nhttps://github.com/willtl/firm.\n","authors":["William T. Lunardi","Abdulrahman Banabila","Dania Herzalla","Martin L. Andreoni"],"pdf_url":"https://arxiv.org/pdf/2501.05130v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.01163v2","updated":"2025-01-09T10:19:09Z","published":"2024-08-02T10:25:19Z","title":"Domain Adaptation-Enhanced Searchlight: Enabling classification of brain\n states from visual perception to mental imagery","summary":" In cognitive neuroscience and brain-computer interface research, accurately\npredicting imagined stimuli is crucial. This study investigates the\neffectiveness of Domain Adaptation (DA) in enhancing imagery prediction using\nprimarily visual data from fMRI scans of 18 subjects. Initially, we train a\nbaseline model on visual stimuli to predict imagined stimuli, utilizing data\nfrom 14 brain regions. We then develop several models to improve imagery\nprediction, comparing different DA methods. Our results demonstrate that DA\nsignificantly enhances imagery prediction in binary classification on our\ndataset, as well as in multiclass classification on a publicly available\ndataset. We then conduct a DA-enhanced searchlight analysis, followed by\npermutation-based statistical tests to identify brain regions where imagery\ndecoding is consistently above chance across subjects. 
Our DA-enhanced\nsearchlight predicts imagery contents in a highly distributed set of brain\nregions, including the visual cortex and the frontoparietal cortex, thereby\noutperforming standard cross-domain classification methods. The complete code\nand data for this paper have been made openly available for the use of the\nscientific community.\n","authors":["Alexander Olza","David Soto","Roberto Santana"],"pdf_url":"https://arxiv.org/pdf/2408.01163v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05113v1","updated":"2025-01-09T09:59:42Z","published":"2025-01-09T09:59:42Z","title":"Constrained Optimization of Charged Particle Tracking with Multi-Agent\n Reinforcement Learning","summary":" Reinforcement learning demonstrated immense success in modelling complex\nphysics-driven systems, providing end-to-end trainable solutions by interacting\nwith a simulated or real environment, maximizing a scalar reward signal. In\nthis work, we propose, building upon previous work, a multi-agent reinforcement\nlearning approach with assignment constraints for reconstructing particle\ntracks in pixelated particle detectors. Our approach optimizes collaboratively\na parametrized policy, functioning as a heuristic to a multidimensional\nassignment problem, by jointly minimizing the total amount of particle\nscattering over the reconstructed tracks in a readout frame. To satisfy\nconstraints, guaranteeing a unique assignment of particle hits, we propose a\nsafety layer solving a linear assignment problem for every joint action.\nFurther, to enforce cost margins, increasing the distance of the local policies\npredictions to the decision boundaries of the optimizer mappings, we recommend\nthe use of an additional component in the blackbox gradient estimation, forcing\nthe policy to solutions with lower total assignment costs. 
We empirically show\non simulated data, generated for a particle detector developed for proton\nimaging, the effectiveness of our approach, compared to multiple single- and\nmulti-agent baselines. We further demonstrate the effectiveness of constraints\nwith cost margins for both optimization and generalization, introduced by wider\nregions with high reconstruction performance as well as reduced predictive\ninstabilities. Our results form the basis for further developments in RL-based\ntracking, offering both enhanced performance with constrained policies and\ngreater flexibility in optimizing tracking algorithms through the option for\nindividual and team rewards.\n","authors":["Tobias Kortus","Ralf Keidel","Nicolas R. Gauger","Jan Kieseler"],"pdf_url":"https://arxiv.org/pdf/2501.05113v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05109v1","updated":"2025-01-09T09:57:33Z","published":"2025-01-09T09:57:33Z","title":"EquiBoost: An Equivariant Boosting Approach to Molecular Conformation\n Generation","summary":" Molecular conformation generation plays key roles in computational drug\ndesign. Recently developed deep learning methods, particularly diffusion models\nhave reached competitive performance over traditional cheminformatical\napproaches. However, these methods are often time-consuming or require extra\nsupport from traditional methods. We propose EquiBoost, a boosting model that\nstacks several equivariant graph transformers as weak learners, to iteratively\nrefine 3D conformations of molecules. Without relying on diffusion techniques,\nEquiBoost balances accuracy and efficiency more effectively than\ndiffusion-based methods. Notably, compared to the previous state-of-the-art\ndiffusion method, EquiBoost improves generation quality and preserves\ndiversity, achieving considerably better precision of Average Minimum RMSD\n(AMR) on the GEOM datasets. 
This work rejuvenates boosting and sheds light on\nits potential to be a robust alternative to diffusion models in certain\nscenarios.\n","authors":["Yixuan Yang","Xingyu Fang","Zhaowen Cheng","Pengju Yan","Xiaolin Li"],"pdf_url":"https://arxiv.org/pdf/2501.05109v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05105v1","updated":"2025-01-09T09:46:27Z","published":"2025-01-09T09:46:27Z","title":"Robust Score Matching","summary":" Proposed in Hyv\\\"arinen (2005), score matching is a parameter estimation\nprocedure that does not require computation of distributional normalizing\nconstants. In this work we utilize the geometric median of means to develop a\nrobust score matching procedure that yields consistent parameter estimates in\nsettings where the observed data has been contaminated. A special appeal of the\nproposed method is that it retains convexity in exponential family models. The\nnew method is therefore particularly attractive for non-Gaussian, exponential\nfamily graphical models where evaluation of normalizing constants is\nintractable. Support recovery guarantees for such models when contamination is\npresent are provided. Additionally, support recovery is studied in numerical\nexperiments and on a precipitation dataset. 
We demonstrate that the proposed\nrobust score matching estimator performs comparably to the standard score\nmatching estimator when no contamination is present but greatly outperforms\nthis estimator in a setting with contamination.\n","authors":["Richard Schwank","Andrew McCormack","Mathias Drton"],"pdf_url":"https://arxiv.org/pdf/2501.05105v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05097v1","updated":"2025-01-09T09:25:22Z","published":"2025-01-09T09:25:22Z","title":"A 1Mb mixed-precision quantized encoder for image classification and\n patch-based compression","summary":" Even though Application-Specific Integrated Circuits (ASIC) have proven to be a\nrelevant choice for integrating inference at the edge, they are often limited\nin terms of applicability. In this paper, we demonstrate that an ASIC neural\nnetwork accelerator dedicated to image processing can be applied to multiple\ntasks of different levels: image classification and compression, while\nrequiring very limited hardware. The key component is a reconfigurable,\nmixed-precision (3b/2b/1b) encoder that takes advantage of proper weight and\nactivation quantizations combined with convolutional layer structural pruning\nto lower hardware-related constraints (memory and computing). We introduce an\nautomatic adaptation of linear symmetric quantizer scaling factors to perform\nquantized levels equalization, aiming at stabilizing quinary and ternary\nweights training. In addition, a proposed layer-shared Bit-Shift Normalization\nsignificantly simplifies the implementation of the hardware-expensive Batch\nNormalization. For a specific configuration in which the encoder design only\nrequires 1Mb, the classification accuracy reaches 87.5% on CIFAR-10. Besides,\nwe also show that this quantized encoder can be used to compress image\npatch-by-patch while the reconstruction can be performed remotely, by a dedicated\nfull-frame decoder. 
This solution typically enables an end-to-end compression\nalmost without any block artifacts, outperforming patch-based state-of-the-art\ntechniques employing a patch-constant bitrate.\n","authors":["Van Thien Nguyen","William Guicquero","Gilles Sicard"],"pdf_url":"https://arxiv.org/pdf/2501.05097v1.pdf","comment":"Published at IEEE Transactions on Circuits and Systems for Video\n Technology (TCSVT)"},{"id":"http://arxiv.org/abs/2410.06232v2","updated":"2025-01-09T09:20:48Z","published":"2024-10-08T17:41:37Z","title":"Range, not Independence, Drives Modularity in Biological Inspired\n Representation","summary":" Why do biological and artificial neurons sometimes modularise, each encoding\na single meaningful variable, and sometimes entangle their representation of\nmany variables? In this work, we develop a theory of when biologically inspired\nnetworks -- those that are nonnegative and energy efficient -- modularise their\nrepresentation of source variables (sources). We derive necessary and\nsufficient conditions on a sample of sources that determine whether the neurons\nin an optimal biologically-inspired linear autoencoder modularise. Our theory\napplies to any dataset, extending far beyond the case of statistical\nindependence studied in previous work. Rather we show that sources modularise\nif their support is ``sufficiently spread''. From this theory, we extract and\nvalidate predictions in a variety of empirical studies on how data distribution\naffects modularisation in nonlinear feedforward and recurrent neural networks\ntrained on supervised and unsupervised tasks. Furthermore, we apply these ideas\nto neuroscience data, showing that range independence can be used to understand\nthe mixing or modularising of spatial and reward information in entorhinal\nrecordings in seemingly conflicting experiments. Further, we use these results\nto suggest alternate origins of mixed-selectivity, beyond the predominant\ntheory of flexible nonlinear classification. 
In sum, our theory prescribes\nprecise conditions on when neural activities modularise, providing tools for\ninducing and elucidating modular representations in brains and machines.\n","authors":["Will Dorrell","Kyle Hsu","Luke Hollingsworth","Jin Hwa Lee","Jiajun Wu","Chelsea Finn","Peter E Latham","Tim EJ Behrens","James CR Whittington"],"pdf_url":"https://arxiv.org/pdf/2410.06232v2.pdf","comment":"40 pages, 16 figures. WD and KH contributed equally; LH and JHL\n contributed equally"},{"id":"http://arxiv.org/abs/2501.05093v1","updated":"2025-01-09T09:19:05Z","published":"2025-01-09T09:19:05Z","title":"Hierarchical Decomposed Dual-domain Deep Learning for Sparse-View CT\n Reconstruction","summary":" Objective: X-ray computed tomography employing sparse projection views has\nemerged as a contemporary technique to mitigate radiation dose. However, due to\nthe inadequate number of projection views, an analytic reconstruction method\nutilizing filtered backprojection results in severe streaking artifacts.\nRecently, deep learning strategies employing image-domain networks have\ndemonstrated remarkable performance in eliminating the streaking artifact\ncaused by analytic reconstruction methods with sparse projection views.\nNevertheless, it is difficult to clarify the theoretical justification for\napplying deep learning to sparse view CT reconstruction, and it has been\nunderstood as restoration by removing image artifacts, not reconstruction.\n Approach: By leveraging the theory of deep convolutional framelets and the\nhierarchical decomposition of measurement, this research reveals the\nconstraints of conventional image- and projection-domain deep learning\nmethodologies; subsequently, it proposes a novel dual-domain deep\nlearning framework utilizing hierarchical decomposed measurements.\nSpecifically, the research elucidates how the performance of the\nprojection-domain network can be enhanced through a low-rank property of deep\nconvolutional framelets 
and a bowtie support of hierarchical decomposed\nmeasurement in the Fourier domain.\n Main Results: This study demonstrated performance improvement of the proposed\nframework based on the low-rank property, resulting in superior reconstruction\nperformance compared to conventional analytic and deep learning methods.\n Significance: By providing a theoretically justified deep learning approach\nfor sparse-view CT reconstruction, this study not only offers a superior\nalternative to existing methods but also opens new avenues for research in\nmedical imaging.\n","authors":["Yoseob Han"],"pdf_url":"https://arxiv.org/pdf/2501.05093v1.pdf","comment":"Published by Physics in Medicine & Biology (2024.4)"},{"id":"http://arxiv.org/abs/2501.05089v1","updated":"2025-01-09T09:12:57Z","published":"2025-01-09T09:12:57Z","title":"Supervised Learning with Evolving Tasks and Performance Guarantees","summary":" Multiple supervised learning scenarios are composed by a sequence of\nclassification tasks. For instance, multi-task learning and continual learning\naim to learn a sequence of tasks that is either fixed or grows over time.\nExisting techniques for learning tasks that are in a sequence are tailored to\nspecific scenarios, lacking adaptability to others. In addition, most of\nexisting techniques consider situations in which the order of the tasks in the\nsequence is not relevant. However, it is common that tasks in a sequence are\nevolving in the sense that consecutive tasks often have a higher similarity.\nThis paper presents a learning methodology that is applicable to multiple\nsupervised learning scenarios and adapts to evolving tasks. 
Differently from\nexisting techniques, we provide computable tight performance guarantees and\nanalytically characterize the increase in the effective sample size.\nExperiments on benchmark datasets show the performance improvement of the\nproposed methodology in multiple scenarios and the reliability of the presented\nperformance guarantees.\n","authors":["Verónica Álvarez","Santiago Mazuelas","Jose A. Lozano"],"pdf_url":"https://arxiv.org/pdf/2501.05089v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2310.15974"},{"id":"http://arxiv.org/abs/2312.03700v2","updated":"2025-01-09T09:12:06Z","published":"2023-12-06T18:59:19Z","title":"OneLLM: One Framework to Align All Modalities with Language","summary":" Multimodal large language models (MLLMs) have gained significant attention\ndue to their strong multimodal understanding capability. However, existing\nworks rely heavily on modality-specific encoders, which usually differ in\narchitecture and are limited to common modalities. In this paper, we present\nOneLLM, an MLLM that aligns eight modalities to language using a unified\nframework. We achieve this through a unified multimodal encoder and a\nprogressive multimodal alignment pipeline. In detail, we first train an image\nprojection module to connect a vision encoder with LLM. Then, we build a\nuniversal projection module (UPM) by mixing multiple image projection modules\nand dynamic routing. Finally, we progressively align more modalities to LLM\nwith the UPM. To fully leverage the potential of OneLLM in following\ninstructions, we also curated a comprehensive multimodal instruction dataset,\nincluding 2M items from image, audio, video, point cloud, depth/normal map, IMU\nand fMRI brain activity. OneLLM is evaluated on 25 diverse benchmarks,\nencompassing tasks such as multimodal captioning, question answering and\nreasoning, where it delivers excellent performance. 
Code, data, model and\nonline demo are available at https://github.com/csuhan/OneLLM\n","authors":["Jiaming Han","Kaixiong Gong","Yiyuan Zhang","Jiaqi Wang","Kaipeng Zhang","Dahua Lin","Yu Qiao","Peng Gao","Xiangyu Yue"],"pdf_url":"https://arxiv.org/pdf/2312.03700v2.pdf","comment":"Accepted by CVPR 2024. Code: https://github.com/csuhan/OneLLM"},{"id":"http://arxiv.org/abs/2501.05087v1","updated":"2025-01-09T09:11:40Z","published":"2025-01-09T09:11:40Z","title":"Enhanced Quantile Regression with Spiking Neural Networks for Long-Term\n System Health Prognostics","summary":" This paper presents a novel predictive maintenance framework centered on\nEnhanced Quantile Regression Neural Networks (EQRNNs) for anticipating system\nfailures in industrial robotics. We address the challenge of early failure\ndetection through a hybrid approach that combines advanced neural\narchitectures. The system leverages dual computational stages: first\nimplementing an EQRNN optimized for processing multi-sensor data streams\nincluding vibration, thermal, and power signatures, followed by an integrated\nSpiking Neural Network (SNN) layer that enables microsecond-level response\ntimes. This architecture achieves notable accuracy rates of 92.3\\% in component\nfailure prediction with a 90-hour advance warning window. Field testing\nconducted on an industrial scale with 50 robotic systems demonstrates\nsignificant operational improvements, yielding a 94\\% decrease in unexpected\nsystem failures and 76\\% reduction in maintenance-related downtimes. 
The\nframework's effectiveness in processing complex, multi-modal sensor data while\nmaintaining computational efficiency validates its applicability for Industry\n4.0 manufacturing environments.\n","authors":["David J Poland"],"pdf_url":"https://arxiv.org/pdf/2501.05087v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05085v1","updated":"2025-01-09T09:10:17Z","published":"2025-01-09T09:10:17Z","title":"End-to-End Deep Learning for Interior Tomography with Low-Dose X-ray CT","summary":" Objective: There exist several X-ray computed tomography (CT) scanning\nstrategies to reduce the radiation dose, such as (1) sparse-view CT, (2) low-dose\nCT, and (3) region-of-interest (ROI) CT (called interior tomography). To\nfurther reduce the dose, the sparse-view and/or low-dose CT settings can be\napplied together with interior tomography. Interior tomography has various\nadvantages in terms of reducing the number of detectors and decreasing the\nX-ray radiation dose. However, a large patient or small field-of-view (FOV)\ndetector can cause truncated projections, and then the reconstructed images\nsuffer from severe cupping artifacts. In addition, although the low-dose CT can\nreduce the radiation exposure dose, analytic reconstruction algorithms produce\nimage noise. Recently, many researchers have utilized image-domain deep\nlearning (DL) approaches to remove each artifact and demonstrated impressive\nperformance, and the theory of deep convolutional framelets supports the\nreason for the performance improvement. Approach: In this paper, we found that\nthe image-domain convolutional neural network (CNN) has difficulty solving\ncoupled artifacts, based on deep convolutional framelets. Significance: To\naddress the coupled problem, we decouple it into two sub-problems: (i) image\ndomain noise reduction inside truncated projection to solve the low-dose CT problem\nand (ii) extrapolation of projection outside truncated projection to solve the\nROI CT problem. 
The decoupled sub-problems are solved directly with a novel\nproposed end-to-end learning using dual-domain CNNs. Main results: We\ndemonstrate that the proposed method outperforms the conventional image-domain\ndeep learning methods, and a projection-domain CNN shows better performance\nthan the image-domain CNNs which are commonly used by many researchers.\n","authors":["Yoseob Han","Dufan Wu","Kyungsang Kim","Quanzheng Li"],"pdf_url":"https://arxiv.org/pdf/2501.05085v1.pdf","comment":"Published by Physics in Medicine & Biology (2022.5)"},{"id":"http://arxiv.org/abs/2412.10095v2","updated":"2025-01-09T09:09:32Z","published":"2024-12-13T12:31:06Z","title":"HiTZ at VarDial 2025 NorSID: Overcoming Data Scarcity with Language\n Transfer and Automatic Data Annotation","summary":" In this paper we present our submission for the NorSID Shared Task as part of\nthe 2025 VarDial Workshop (Scherrer et al., 2025), consisting of three tasks:\nIntent Detection, Slot Filling and Dialect Identification, evaluated using data\nin different dialects of the Norwegian language. For Intent Detection and Slot\nFilling, we have fine-tuned a multitask model in a cross-lingual setting, to\nleverage the xSID dataset available in 17 languages. In the case of Dialect\nIdentification, our final submission consists of a model fine-tuned on the\nprovided development set, which has obtained the highest scores within our\nexperiments. Our final results on the test set show that our models do not drop\nin performance compared to the development set, likely due to the\ndomain-specificity of the dataset and the similar distribution of both subsets.\nFinally, we also report an in-depth analysis of the provided datasets and their\nartifacts, as well as other sets of experiments that have been carried out but\ndid not yield the best results. 
Additionally, we present an analysis on the\nreasons why some methods have been more successful than others; mainly the\nimpact of the combination of languages and domain-specificity of the training\ndata on the results.\n","authors":["Jaione Bengoetxea","Mikel Zubillaga","Ekhi Azurmendi","Maite Heredia","Julen Etxaniz","Markel Ferro","Jeremy Barnes"],"pdf_url":"https://arxiv.org/pdf/2412.10095v2.pdf","comment":"Vardial 2025 NorSID Shared Task, fixed minor typos"},{"id":"http://arxiv.org/abs/2501.05082v1","updated":"2025-01-09T09:03:43Z","published":"2025-01-09T09:03:43Z","title":"Comparison of Feature Learning Methods for Metadata Extraction from PDF\n Scholarly Documents","summary":" The availability of metadata for scientific documents is pivotal in\npropelling scientific knowledge forward and for adhering to the FAIR principles\n(i.e. Findability, Accessibility, Interoperability, and Reusability) of\nresearch findings. However, the lack of sufficient metadata in published\ndocuments, particularly those from smaller and mid-sized publishers, hinders\ntheir accessibility. This issue is widespread in some disciplines, such as the\nGerman Social Sciences, where publications often employ diverse templates. To\naddress this challenge, our study evaluates various feature learning and\nprediction methods, including natural language processing (NLP), computer\nvision (CV), and multimodal approaches, for extracting metadata from documents\nwith high template variance. We aim to improve the accessibility of scientific\ndocuments and facilitate their wider use. To support our comparison of these\nmethods, we provide comprehensive experimental results, analyzing their\naccuracy and efficiency in extracting metadata. 
Additionally, we provide\nvaluable insights into the strengths and weaknesses of various feature learning\nand prediction methods, which can guide future research in this field.\n","authors":["Zeyd Boukhers","Cong Yang"],"pdf_url":"https://arxiv.org/pdf/2501.05082v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05081v1","updated":"2025-01-09T09:02:41Z","published":"2025-01-09T09:02:41Z","title":"DriVLM: Domain Adaptation of Vision-Language Models in Autonomous\n Driving","summary":" In recent years, large language models have had a very impressive\nperformance, which largely contributed to the development and application of\nartificial intelligence, and the parameters and performance of the models are\nstill growing rapidly. In particular, multimodal large language models (MLLM)\ncan combine multiple modalities such as pictures, videos, sounds, texts, etc.,\nand have great potential in various tasks. However, most MLLMs require very\nhigh computational resources, which is a major challenge for most researchers\nand developers. In this paper, we explored the utility of small-scale MLLMs and\napplied small-scale MLLMs to the field of autonomous driving. We hope that this\nwill advance the application of MLLMs in real-world scenarios.\n","authors":["Xuran Zheng","Chang D. Yoo"],"pdf_url":"https://arxiv.org/pdf/2501.05081v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05078v1","updated":"2025-01-09T09:00:32Z","published":"2025-01-09T09:00:32Z","title":"Analyzing Memorization in Large Language Models through the Lens of\n Model Attribution","summary":" Large Language Models (LLMs) are prevalent in modern applications but often\nmemorize training data, leading to privacy breaches and copyright issues.\nExisting research has mainly focused on posthoc analyses, such as extracting\nmemorized content or developing memorization metrics, without exploring the\nunderlying architectural factors that contribute to memorization. 
In this work,\nwe investigate memorization from an architectural lens by analyzing how\nattention modules at different layers impact the model's memorization and\ngeneralization performance. Using attribution techniques, we systematically\nintervene in the LLM architecture by bypassing attention modules at specific\nblocks while keeping other components like layer normalization and MLP\ntransformations intact. We provide theorems analyzing our intervention\nmechanism from a mathematical view, bounding the difference in layer outputs\nwith and without our attributions. Our theoretical and empirical analyses\nreveal that attention modules in deeper transformer blocks are primarily\nresponsible for memorization, whereas earlier blocks are crucial for the model's\ngeneralization and reasoning capabilities. We validate our findings through\ncomprehensive experiments on different LLM families (Pythia and GPTNeo) and\nfive benchmark datasets. Our insights offer a practical approach to mitigate\nmemorization in LLMs while preserving their performance, contributing to safer\nand more ethical deployment in real-world applications.\n","authors":["Tarun Ram Menta","Susmit Agrawal","Chirag Agarwal"],"pdf_url":"https://arxiv.org/pdf/2501.05078v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04517v2","updated":"2025-01-09T09:00:02Z","published":"2025-01-08T14:06:07Z","title":"Histogram-Equalized Quantization for logic-gated Residual Neural\n Networks","summary":" Adjusting the quantization according to the data or to the model loss seems\nmandatory to enable a high accuracy in the context of quantized neural\nnetworks. This work presents Histogram-Equalized Quantization (HEQ), an\nadaptive framework for linear symmetric quantization. HEQ automatically adapts\nthe quantization thresholds using a unique step size optimization. 
We\nempirically show that HEQ achieves state-of-the-art performances on CIFAR-10.\nExperiments on the STL-10 dataset even show that HEQ enables a proper training\nof our proposed logic-gated (OR, MUX) residual networks with a higher accuracy\nat a lower hardware complexity than previous work.\n","authors":["Van Thien Nguyen","William Guicquero","Gilles Sicard"],"pdf_url":"https://arxiv.org/pdf/2501.04517v2.pdf","comment":"Published at IEEE ISCAS 2022"},{"id":"http://arxiv.org/abs/2501.05076v1","updated":"2025-01-09T08:59:23Z","published":"2025-01-09T08:59:23Z","title":"TipSegNet: Fingertip Segmentation in Contactless Fingerprint Imaging","summary":" Contactless fingerprint recognition systems offer a hygienic, user-friendly,\nand efficient alternative to traditional contact-based methods. However, their\naccuracy heavily relies on precise fingertip detection and segmentation,\nparticularly under challenging background conditions. This paper introduces\nTipSegNet, a novel deep learning model that achieves state-of-the-art\nperformance in segmenting fingertips directly from grayscale hand images.\nTipSegNet leverages a ResNeXt-101 backbone for robust feature extraction,\ncombined with a Feature Pyramid Network (FPN) for multi-scale representation,\nenabling accurate segmentation across varying finger poses and image qualities.\nFurthermore, we employ an extensive data augmentation strategy to enhance the\nmodel's generalizability and robustness. TipSegNet outperforms existing\nmethods, achieving a mean Intersection over Union (mIoU) of 0.987 and an\naccuracy of 0.999, representing a significant advancement in contactless\nfingerprint segmentation. 
This enhanced accuracy has the potential to\nsubstantially improve the reliability and effectiveness of contactless\nbiometric systems in real-world applications.\n","authors":["Laurenz Ruzicka","Bernhard Kohn","Clemens Heitzinger"],"pdf_url":"https://arxiv.org/pdf/2501.05076v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05075v1","updated":"2025-01-09T08:59:14Z","published":"2025-01-09T08:59:14Z","title":"A Text-Based Knowledge-Embedded Soft Sensing Modeling Approach for\n General Industrial Process Tasks Based on Large Language Model","summary":" Data-driven soft sensors (DDSS) have become mainstream methods for predicting\nkey performance indicators in process industries. However, DDSS development\nrequires complex and costly customized designs tailored to various tasks during\nthe modeling process. Moreover, DDSS are constrained to a single structured\ndata modality, limiting their ability to incorporate additional contextual\nknowledge. Furthermore, DDSSs' limited representation learning leads to weak\npredictive performance with scarce data. To address these challenges, we\npropose a general framework named LLM-TKESS (large language model for\ntext-based knowledge-embedded soft sensing), harnessing the powerful general\nproblem-solving capabilities, cross-modal knowledge transfer abilities, and\nfew-shot capabilities of LLM for enhanced soft sensing modeling. Specifically,\nan auxiliary variable series encoder (AVS Encoder) is proposed to unleash LLM's\npotential for capturing temporal relationships within series and spatial\nsemantic relationships among auxiliary variables. Then, we propose a two-stage\nfine-tuning alignment strategy: in the first stage, employing\nparameter-efficient fine-tuning through autoregressive training adjusts LLM to\nrapidly accommodate process variable data, resulting in a soft sensing\nfoundation model (SSFM). 
Subsequently, by training adapters, we adapt the SSFM\nto various downstream tasks without modifying its architecture. Then, we\npropose two text-based knowledge-embedded soft sensors, integrating new natural\nlanguage modalities to overcome the limitations of pure structured data models.\nFurthermore, benefiting from LLM's pre-existing world knowledge, our model\ndemonstrates outstanding predictive capabilities in small sample conditions.\nUsing the thermal deformation of air preheater rotor as a case study, we\nvalidate through extensive experiments that LLM-TKESS exhibits outstanding\nperformance.\n","authors":["Shuo Tong","Han Liu","Runyuan Guo","Xueqiong Tian","Wenqing Wang","Ding Liu","Youmin Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.05075v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05068v1","updated":"2025-01-09T08:44:06Z","published":"2025-01-09T08:44:06Z","title":"D3RM: A Discrete Denoising Diffusion Refinement Model for Piano\n Transcription","summary":" Diffusion models have been widely used in the generative domain due to their\nconvincing performance in modeling complex data distributions. Moreover, they\nhave shown competitive results on discriminative tasks, such as image\nsegmentation. While diffusion models have also been explored for automatic\nmusic transcription, their performance has yet to reach a competitive level. In\nthis paper, we focus on discrete diffusion model's refinement capabilities and\npresent a novel architecture for piano transcription. Our model utilizes\nNeighborhood Attention layers as the denoising module, gradually predicting the\ntarget high-resolution piano roll, conditioned on the finetuned features of a\npretrained acoustic model. To further enhance refinement, we devise a novel\nstrategy which applies distinct transition states during training and inference\nstage of discrete diffusion models. 
Experiments on the MAESTRO dataset show\nthat our approach outperforms previous diffusion-based piano transcription\nmodels and the baseline model in terms of F1 score. Our code is available at\nhttps://github.com/hanshounsu/d3rm.\n","authors":["Hounsu Kim","Taegyun Kwon","Juhan Nam"],"pdf_url":"https://arxiv.org/pdf/2501.05068v1.pdf","comment":"Accepted to ICASSP 2025"},{"id":"http://arxiv.org/abs/2501.04614v2","updated":"2025-01-09T08:42:56Z","published":"2025-01-08T16:53:56Z","title":"MedCoDi-M: A Multi-Prompt Foundation Model for Multimodal Medical Data\n Generation","summary":" Artificial Intelligence is revolutionizing medical practice, enhancing\ndiagnostic accuracy and healthcare delivery. However, its adaptation in medical\nsettings still faces significant challenges, related to data availability and\nprivacy constraints. Synthetic data has emerged as a promising solution to\nmitigate these issues, addressing data scarcity while preserving privacy.\nRecently, Latent Diffusion Models have emerged as a powerful tool for\ngenerating high-quality synthetic data. Meanwhile, the integration of different\nmodalities has gained interest, emphasizing the need for models capable of\nhandling multimodal medical data. Existing approaches struggle to integrate\ncomplementary information and lack the ability to generate modalities\nsimultaneously. To address this challenge, we present MedCoDi-M, a\n6.77-billion-parameter model, designed for multimodal medical data generation,\nthat, following the Foundation Model paradigm, exploits contrastive learning and\na large quantity of data to build a shared latent space which captures the\nrelationships between different data modalities. Further, we introduce the\nMulti-Prompt training technique, which significantly boosts MedCoDi-M's\ngeneration under different settings. 
We extensively validate MedCoDi-M: first\nwe benchmark it against five competitors on the MIMIC-CXR dataset, a\nstate-of-the-art dataset for Chest X-ray and radiological report generation.\nSecondly, we perform a Visual Turing Test with expert radiologists to assess\nthe realism and clinical relevance of the generated data, ensuring alignment\nwith real-world scenarios. Finally, we assess the utility of MedCoDi-M in\naddressing key challenges in the medical field, such as anonymization, data\nscarcity and imbalanced learning. The results are promising, demonstrating the\napplicability of MedCoDi-M in medical contexts. Project page is at\nhttps://cosbidev.github.io/MedCoDi-M/.\n","authors":["Daniele Molino","Francesco Di Feola","Eliodoro Faiella","Deborah Fazzini","Domiziana Santucci","Linlin Shen","Valerio Guarrasi","Paolo Soda"],"pdf_url":"https://arxiv.org/pdf/2501.04614v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05058v1","updated":"2025-01-09T08:28:31Z","published":"2025-01-09T08:28:31Z","title":"Simultaneous emulation and downscaling with physically-consistent deep\n learning-based regional ocean emulators","summary":" Building on top of the success in AI-based atmospheric emulation, we propose\nan AI-based ocean emulation and downscaling framework focusing on the\nhigh-resolution regional ocean over the Gulf of Mexico. Regional ocean emulation\npresents unique challenges owing to the complex bathymetry and lateral boundary\nconditions as well as to fundamental biases in deep learning-based\nframeworks, such as instability and hallucinations. In this paper, we develop a\ndeep learning-based framework to autoregressively integrate ocean-surface\nvariables over the Gulf of Mexico at $8$ Km spatial resolution without\nunphysical drifts over decadal time scales and simultaneously downscale and\nbias-correct it to $4$ Km resolution using a physics-constrained generative\nmodel. 
The framework shows both short-term skill and accurate long-term\nstatistics in terms of mean and variability.\n","authors":["Leonard Lupin-Jimenez","Moein Darman","Subhashis Hazarika","Tianning Wu","Michael Gray","Ruyoing He","Anthony Wong","Ashesh Chattopadhyay"],"pdf_url":"https://arxiv.org/pdf/2501.05058v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05057v1","updated":"2025-01-09T08:28:16Z","published":"2025-01-09T08:28:16Z","title":"LearningFlow: Automated Policy Learning Workflow for Urban Driving with\n Large Language Models","summary":" Recent advancements in reinforcement learning (RL) demonstrate\nsignificant potential in autonomous driving. Despite this promise, challenges\nsuch as the manual design of reward functions and low sample efficiency in\ncomplex environments continue to impede the development of safe and effective\ndriving policies. To tackle these issues, we introduce LearningFlow, an\ninnovative automated policy learning workflow tailored to urban driving. This\nframework leverages the collaboration of multiple large language model (LLM)\nagents throughout the RL training process. LearningFlow includes a curriculum\nsequence generation process and a reward generation process, which work in\ntandem to guide the RL policy by generating tailored training curricula and\nreward functions. Particularly, each process is supported by an analysis agent\nthat evaluates training progress and provides critical insights to the\ngeneration agent. Through the collaborative efforts of these LLM agents,\nLearningFlow automates policy learning across a series of complex driving\ntasks, and it significantly reduces the reliance on manual reward function\ndesign while enhancing sample efficiency. 
Comprehensive experiments are\nconducted in the high-fidelity CARLA simulator, along with comparisons with\nother existing methods, to demonstrate the efficacy of our proposed approach.\nThe results demonstrate that LearningFlow excels in generating rewards and\ncurricula. It also achieves superior performance and robust generalization\nacross various driving tasks, as well as commendable adaptation to different RL\nalgorithms.\n","authors":["Zengqi Peng","Yubin Wang","Xu Han","Lei Zheng","Jun Ma"],"pdf_url":"https://arxiv.org/pdf/2501.05057v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.02945v2","updated":"2025-01-09T08:26:17Z","published":"2025-01-06T11:38:19Z","title":"The Tabular Foundation Model TabPFN Outperforms Specialized Time Series\n Forecasting Models Based on Simple Features","summary":" Foundation models have become popular in forecasting due to their ability to\nmake accurate predictions, even with minimal fine-tuning on specific datasets.\nIn this paper, we demonstrate how the newly released regression variant of\nTabPFN, a general tabular foundation model, can be applied to time series\nforecasting. We propose a straightforward approach, TabPFN-TS, which pairs\nTabPFN with simple feature engineering to achieve strong forecasting\nperformance. Despite its simplicity and with only 11M parameters, TabPFN-TS\noutperforms Chronos-Mini, a model of similar size, and matches or even slightly\noutperforms Chronos-Large, which has 65-fold more parameters. 
A key strength of\nour method lies in its reliance solely on artificial data during pre-training,\navoiding the need for large training datasets and eliminating the risk of\nbenchmark contamination.\n","authors":["Shi Bin Hoo","Samuel Müller","David Salinas","Frank Hutter"],"pdf_url":"https://arxiv.org/pdf/2501.02945v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.17908v2","updated":"2025-01-09T08:17:23Z","published":"2024-12-23T19:04:46Z","title":"Trading Devil RL: Backdoor attack via Stock market, Bayesian\n Optimization and Reinforcement Learning","summary":" With the rapid development of generative artificial intelligence,\nparticularly large language models, a number of sub-fields of deep learning\nhave made significant progress and are now very useful in everyday\napplications. For example, well-known financial institutions simulate a wide\nrange of scenarios for various models created by their research teams using\nreinforcement learning, both before production and after regular operations. In\nthis work, we propose a backdoor attack that focuses solely on data poisoning.\nThis particular backdoor attack is classified as an attack without prior\nconsideration or trigger, and we name it FinanceLLMsBackRL. 
Our aim is to\nexamine the potential effects of large language models that use reinforcement\nlearning systems for text production or speech recognition, finance, physics,\nor the ecosystem of contemporary artificial intelligence models.\n","authors":["Orson Mengara"],"pdf_url":"https://arxiv.org/pdf/2412.17908v2.pdf","comment":"End of data poisoning research!: Navier-Stokes equations (3D;\n update); Reinforcement Learning (RL); HFT (High Frequency Trading); Limit\n Order Markets and backdoor attack detection"},{"id":"http://arxiv.org/abs/2306.09202v3","updated":"2025-01-09T08:14:06Z","published":"2023-06-15T15:37:31Z","title":"A Fast Algorithm for the Real-Valued Combinatorial Pure Exploration of\n Multi-Armed Bandit","summary":" We study the real-valued combinatorial pure exploration problem in the\nstochastic multi-armed bandit (R-CPE-MAB). We study the case where the size of\nthe action set is polynomial with respect to the number of arms. In such a\ncase, the R-CPE-MAB can be seen as a special case of the so-called transductive\nlinear bandits. We introduce an algorithm named the combinatorial gap-based\nexploration (CombGapE) algorithm, whose sample complexity upper bound matches\nthe lower bound up to a problem-dependent constant factor. We numerically show\nthat the CombGapE algorithm outperforms existing methods significantly on both\nsynthetic and real-world datasets.\n","authors":["Shintaro Nakamura","Masashi Sugiyama"],"pdf_url":"https://arxiv.org/pdf/2306.09202v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04710v3","updated":"2025-01-09T07:58:13Z","published":"2024-04-06T19:07:12Z","title":"Exploiting the geometry of heterogeneous networks: A case study of the\n Indian stock market","summary":" In this study, we model the Indian stock market as a heterogeneous scale-free\nnetwork, which is then embedded in a two-dimensional hyperbolic space through a\nmachine learning-based technique called coalescent embedding. 
This allows us\nto apply the hyperbolic k-means algorithm on the Poincare disc, and the clusters\nso obtained resemble the original network communities more closely than the\nclusters obtained via Euclidean k-means, on the basis of the well-known measures\nnormalised mutual information and adjusted mutual information. Through this, we\nare able to clearly distinguish between periods of market stability and\nvolatility by applying non-parametric statistical tests with a significance\nlevel of 0.05 to geometric measures, namely hyperbolic distance and hyperbolic\nshortest path distance. After that, we are able to spot significant market\nchanges early by leveraging Bollinger Band analysis on the time series of\nmodularity in the embedded networks of each window. Finally, the radial\ndistance and the Equidistance Angular coordinates help in visualizing the\nembedded network in the Poincare disc, and it is seen that specific market\nsectors cluster together.\n","authors":["Pawanesh Pawanesh","Charu Sharma","Niteesh Sahni"],"pdf_url":"https://arxiv.org/pdf/2404.04710v3.pdf","comment":"39 pages, 11 figures"},{"id":"http://arxiv.org/abs/2501.05037v1","updated":"2025-01-09T07:51:14Z","published":"2025-01-09T07:51:14Z","title":"LongViTU: Instruction Tuning for Long-Form Video Understanding","summary":" This paper introduces LongViTU, a large-scale (~121k QA pairs, ~900h videos),\nautomatically generated dataset for long-form video understanding. We developed\na systematic approach that organizes videos into a hierarchical tree structure\nand incorporates self-revision mechanisms to ensure high-quality QA pairs. Each\nQA pair in LongViTU features: 1) long-term context (average certificate length\nof 4.6 minutes); 2) rich knowledge and condensed reasoning (commonsense,\ncausality, planning, etc.); and 3) explicit timestamp labels for relevant\nevents. LongViTU also serves as a benchmark for instruction following in\nlong-form and streaming video understanding. 
We evaluate the open-source\nstate-of-the-art long video understanding model, LongVU, and the commercial\nmodel, Gemini-1.5-Pro, on our benchmark. They achieve GPT-4 scores of 49.9 and\n52.3, respectively, underscoring the substantial challenge posed by our\nbenchmark. Further supervised fine-tuning (SFT) on LongVU led to performance\nimprovements of 12.0% on our benchmark, 2.2% on the in-distribution (ID)\nbenchmark EgoSchema, 1.0%, 2.2% and 1.2% on the out-of-distribution (OOD)\nbenchmarks VideoMME (Long), WorldQA and OpenEQA, respectively. These outcomes\ndemonstrate LongViTU's high data quality and robust OOD generalizability.\n","authors":["Rujie Wu","Xiaojian Ma","Hai Ci","Yue Fan","Yuxuan Wang","Haozhe Zhao","Qing Li","Yizhou Wang"],"pdf_url":"https://arxiv.org/pdf/2501.05037v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05034v1","updated":"2025-01-09T07:49:37Z","published":"2025-01-09T07:49:37Z","title":"Towards Fingerprint Mosaicking Artifact Detection: A Self-Supervised\n Deep Learning Approach","summary":" Fingerprint mosaicking, which is the process of combining multiple\nfingerprint images into a single master fingerprint, is an essential process in\nmodern biometric systems. However, it is prone to errors that can significantly\ndegrade fingerprint image quality. This paper proposes a novel deep\nlearning-based approach to detect and score mosaicking artifacts in fingerprint\nimages. Our method leverages a self-supervised learning framework to train a\nmodel on large-scale unlabeled fingerprint data, eliminating the need for\nmanual artifact annotation. The proposed model effectively identifies\nmosaicking errors, achieving high accuracy on various fingerprint modalities,\nincluding contactless, rolled, and pressed fingerprints and furthermore proves\nto be robust to different data sources. Additionally, we introduce a novel\nmosaicking artifact score to quantify the severity of errors, enabling\nautomated evaluation of fingerprint images. 
By addressing the challenges of\nmosaicking artifact detection, our work contributes to improving the accuracy\nand reliability of fingerprint-based biometric systems.\n","authors":["Laurenz Ruzicka","Alexander Spenke","Stephan Bergmann","Gerd Nolden","Bernhard Kohn","Clemens Heitzinger"],"pdf_url":"https://arxiv.org/pdf/2501.05034v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05031v1","updated":"2025-01-09T07:43:49Z","published":"2025-01-09T07:43:49Z","title":"ECBench: Can Multi-modal Foundation Models Understand the Egocentric\n World? A Holistic Embodied Cognition Benchmark","summary":" The enhancement of generalization in robots by large vision-language models\n(LVLMs) is increasingly evident. Therefore, the embodied cognitive abilities of\nLVLMs based on egocentric videos are of great interest. However, current\ndatasets for embodied video question answering lack comprehensive and\nsystematic evaluation frameworks. Critical embodied cognitive issues, such as\nrobotic self-cognition, dynamic scene perception, and hallucination, are rarely\naddressed. To tackle these challenges, we propose ECBench, a high-quality\nbenchmark designed to systematically evaluate the embodied cognitive abilities\nof LVLMs. ECBench features a diverse range of scene video sources, open and\nvaried question formats, and 30 dimensions of embodied cognition. To ensure\nquality, balance, and high visual dependence, ECBench uses class-independent\nmeticulous human annotation and multi-round question screening strategies.\nAdditionally, we introduce ECEval, a comprehensive evaluation system that\nensures the fairness and rationality of the indicators. Utilizing ECBench, we\nconduct extensive evaluations of proprietary, open-source, and task-specific\nLVLMs. ECBench is pivotal in advancing the embodied cognitive capabilities of\nLVLMs, laying a solid foundation for developing reliable core models for\nembodied agents. 
All data and code are available at\nhttps://github.com/Rh-Dang/ECBench.\n","authors":["Ronghao Dang","Yuqian Yuan","Wenqi Zhang","Yifei Xin","Boqiang Zhang","Long Li","Liuyi Wang","Qinyang Zeng","Xin Li","Lidong Bing"],"pdf_url":"https://arxiv.org/pdf/2501.05031v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09184v3","updated":"2025-01-09T07:42:45Z","published":"2024-01-17T12:50:50Z","title":"A Two-Scale Complexity Measure for Deep Learning Models","summary":" We introduce a novel capacity measure 2sED for statistical models based on\nthe effective dimension. The new quantity provably bounds the generalization\nerror under mild assumptions on the model. Furthermore, simulations on standard\ndata sets and popular model architectures show that 2sED correlates well with\nthe training error. For Markovian models, we show how to efficiently\napproximate 2sED from below through a layerwise iterative approach, which\nallows us to tackle deep learning models with a large number of parameters.\nSimulation results suggest that the approximation is good for different\nprominent models and data sets.\n","authors":["Massimiliano Datres","Gian Paolo Leonardi","Alessio Figalli","David Sutter"],"pdf_url":"https://arxiv.org/pdf/2401.09184v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06764v3","updated":"2025-01-09T07:39:30Z","published":"2023-08-13T13:01:21Z","title":"Few-shot Class-incremental Learning for Classification and Object\n Detection: A Survey","summary":" Few-shot Class-Incremental Learning (FSCIL) presents a unique challenge in\nMachine Learning (ML), as it necessitates the Incremental Learning (IL) of new\nclasses from sparsely labeled training samples without forgetting previous\nknowledge. While this field has seen recent progress, it remains an active\nexploration area. This paper aims to provide a comprehensive and systematic\nreview of FSCIL. 
In our in-depth examination, we delve into various facets of\nFSCIL, encompassing the problem definition, the discussion of the primary\nchallenges of unreliable empirical risk minimization and the\nstability-plasticity dilemma, general schemes, and relevant problems of IL and\nFew-shot Learning (FSL). Besides, we offer an overview of benchmark datasets\nand evaluation metrics. Furthermore, we introduce the Few-shot\nClass-incremental Classification (FSCIC) methods from data-based,\nstructure-based, and optimization-based approaches and the Few-shot\nClass-incremental Object Detection (FSCIOD) methods from anchor-free and\nanchor-based approaches. Beyond these, we present several promising research\ndirections within FSCIL that merit further investigation.\n","authors":["Jinghua Zhang","Li Liu","Olli Silvén","Matti Pietikäinen","Dewen Hu"],"pdf_url":"https://arxiv.org/pdf/2308.06764v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05015v1","updated":"2025-01-09T07:16:21Z","published":"2025-01-09T07:16:21Z","title":"On Measuring Unnoticeability of Graph Adversarial Attacks: Observations,\n New Measure, and Applications","summary":" Adversarial attacks are allegedly unnoticeable. Prior studies have designed\nattack noticeability measures on graphs, primarily using statistical tests to\ncompare the topology of original and (possibly) attacked graphs. However, we\nobserve two critical limitations in the existing measures. First, because the\nmeasures rely on simple rules, attackers can readily enhance their attacks to\nbypass them, reducing their attack \"noticeability\" and, yet, maintaining their\nattack performance. Second, because the measures naively leverage global\nstatistics, such as degree distributions, they may entirely overlook attacks\nuntil severe perturbations occur, letting the attacks be almost \"totally\nunnoticeable.\" To address the limitations, we introduce HideNSeek, a learnable\nmeasure for graph attack noticeability. 
First, to mitigate the bypass problem,\nHideNSeek learns to distinguish the original and (potential) attack edges using\na learnable edge scorer (LEO), which scores each edge on its likelihood of\nbeing an attack. Second, to mitigate the overlooking problem, HideNSeek\nconducts imbalance-aware aggregation of all the edge scores to obtain the final\nnoticeability score. Using six real-world graphs, we empirically demonstrate\nthat HideNSeek effectively alleviates the observed limitations, and LEO (i.e.,\nour learnable edge scorer) outperforms eleven competitors in distinguishing\nattack edges under five different attack methods. For an additional\napplication, we show that LEO boosts the performance of robust GNNs by removing\nattack-like edges.\n","authors":["Hyeonsoo Jo","Hyunjin Hwang","Fanchen Bu","Soo Yong Lee","Chanyoung Park","Kijung Shin"],"pdf_url":"https://arxiv.org/pdf/2501.05015v1.pdf","comment":"KDD 2025"},{"id":"http://arxiv.org/abs/2501.05014v1","updated":"2025-01-09T07:15:59Z","published":"2025-01-09T07:15:59Z","title":"UAV-VLA: Vision-Language-Action System for Large Scale Aerial Mission\n Generation","summary":" The UAV-VLA (Vision-Language-Action) system is a tool designed to facilitate\ncommunication with aerial robots. By integrating satellite imagery processing\nwith the Visual Language Model (VLM) and the powerful capabilities of GPT,\nUAV-VLA enables users to generate general flight path-and-action plans through\nsimple text requests. This system leverages the rich contextual information\nprovided by satellite images, allowing for enhanced decision-making and mission\nplanning. The combination of visual analysis by the VLM and natural language\nprocessing by GPT can provide the user with the path-and-action set, making\naerial operations more efficient and accessible. 
The newly developed method\nshowed a difference of 22% in the length of the created trajectory and a mean\nerror of 34.22 m (Euclidean distance) in finding the objects of interest on a\nmap using the K-Nearest Neighbors (KNN) approach.\n","authors":["Oleg Sautenkov","Yasheerah Yaqoot","Artem Lykov","Muhammad Ahsan Mustafa","Grik Tadevosyan","Aibek Akhmetkazy","Miguel Altamirano Cabrera","Mikhail Martynov","Sausar Karaf","Dzmitry Tsetserukou"],"pdf_url":"https://arxiv.org/pdf/2501.05014v1.pdf","comment":"HRI 2025"},{"id":"http://arxiv.org/abs/2501.05007v1","updated":"2025-01-09T07:05:22Z","published":"2025-01-09T07:05:22Z","title":"Quantum-enhanced causal discovery for a small number of samples","summary":" The discovery of causal relationships from observed data has attracted\nsignificant interest from disciplines such as economics, social sciences,\nepidemiology, and biology. In practical applications, considerable knowledge of\nthe underlying systems is often unavailable, and real data are often associated\nwith nonlinear causal structures, which make the direct use of most\nconventional causality analysis methods difficult. This study proposes a novel\nquantum Peter-Clark (qPC) algorithm for causal discovery that does not assume\nany underlying model structures. Based on conditional independence tests in\na class of reproducing kernel Hilbert spaces characterized by quantum circuits,\nthe proposed qPC algorithm can explore causal relationships from the observed\ndata drawn from arbitrary distributions. We conducted systematic experiments on\nfundamental graph parts of causal structures, demonstrating that the qPC\nalgorithm exhibits significantly better performance, particularly with\nsmaller sample sizes, compared to its classical counterpart. Furthermore, we\nproposed a novel optimization approach based on Kernel Target Alignment (KTA)\nfor determining hyperparameters of quantum kernels. 
This method effectively\nreduced the risk of false positives in causal discovery, enabling more reliable\ninference. Our theoretical and experimental results demonstrate that the\nproposed quantum algorithm can empower classical algorithms for robust and\naccurate inference in causal discovery, supporting them in regimes where\nclassical algorithms typically fail. Additionally, the effectiveness of this\nmethod was validated using the Boston Housing dataset as a real-world\napplication. These findings demonstrate the new potential of quantum\ncircuit-based causal discovery methods in addressing practical challenges,\nparticularly in small-sample scenarios where traditional approaches have shown\nlimitations.\n","authors":["Yota Maeda","Ken Arai","Yu Tanaka","Yu Terada","Hiroshi Ueno","Hiroyuki Tezuka"],"pdf_url":"https://arxiv.org/pdf/2501.05007v1.pdf","comment":"19 pages, 8 figures"},{"id":"http://arxiv.org/abs/2501.05005v1","updated":"2025-01-09T06:56:47Z","published":"2025-01-09T06:56:47Z","title":"A High-accuracy Calibration Method of Transient TSEPs for Power\n Semiconductor Devices","summary":" The thermal sensitive electrical parameter (TSEP) method is crucial for\nenhancing the reliability of power devices through junction temperature\nmonitoring. The TSEP method comprises three key processes: calibration,\nregression, and application. While significant efforts have been devoted to\nimproving regression algorithms and increasing TSEP sensitivity to enhance\njunction temperature monitoring accuracy, these approaches have reached a\nbottleneck. In reality, the calibration method significantly influences\nmonitoring accuracy, an aspect often overlooked in conventional TSEP methods.\nTo address this issue, we propose a high-accuracy calibration method for\ntransient TSEPs. First, a temperature compensation strategy based on thermal\nanalysis is introduced to mitigate the temperature difference caused by load\ncurrent during dual pulse tests. 
Second, the impact of stray parameters is\nanalyzed to identify coupled parameters, which are typically neglected in\nexisting methods. Third, it is observed that random errors follow a log-Gaussian\ndistribution, covering a hidden variable. A neural network is used to\nobtain the junction temperature predictive model. The proposed calibration\nmethod is experimentally validated using the threshold voltage as an example.\nCompared with conventional calibration methods, the mean absolute error is\nreduced by over 30%. Moreover, this method does not require additional hardware\ncost and has good generalization.\n","authors":["Qinghao Zhang","Wenrui Li","Pinjia Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.05005v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07204v5","updated":"2025-01-09T06:53:50Z","published":"2024-02-11T13:30:53Z","title":"ITINERA: Integrating Spatial Optimization with Large Language Models for\n Open-domain Urban Itinerary Planning","summary":" Citywalk, a recently popular form of urban travel, requires genuine\npersonalization and understanding of fine-grained requests compared to\ntraditional itinerary planning. In this paper, we introduce the novel task of\nOpen-domain Urban Itinerary Planning (OUIP), which generates personalized urban\nitineraries from user requests in natural language. We then present ITINERA, an\nOUIP system that integrates spatial optimization with large language models to\nprovide customized urban itineraries based on user needs. This involves\ndecomposing user requests, selecting candidate points of interest (POIs),\nordering the POIs based on cluster-aware spatial optimization, and generating\nthe itinerary. Experiments on real-world datasets and the performance of the\ndeployed system demonstrate our system's capacity to deliver personalized and\nspatially coherent itineraries compared to current solutions. 
Source code for\nITINERA is available at https://github.com/YihongT/ITINERA.\n","authors":["Yihong Tang","Zhaokai Wang","Ao Qu","Yihao Yan","Zhaofeng Wu","Dingyi Zhuang","Jushi Kai","Kebing Hou","Xiaotong Guo","Han Zheng","Tiange Luo","Jinhua Zhao","Zhan Zhao","Wei Ma"],"pdf_url":"https://arxiv.org/pdf/2402.07204v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03670v2","updated":"2025-01-09T06:47:34Z","published":"2024-03-06T12:47:14Z","title":"CDC: A Simple Framework for Complex Data Clustering","summary":" In today's data-driven digital era, both the amount and the complexity (e.g.,\nmulti-view, non-Euclidean, and multi-relational) of the collected data are\ngrowing exponentially or even faster. Clustering, which extracts valid\nknowledge from data without supervision, is extremely useful in practice.\nHowever, existing methods are independently developed to handle one particular\nchallenge at the expense of the others. In this work, we propose a simple but\neffective framework for complex data clustering (CDC) that can efficiently\nprocess different types of data with linear complexity. We first utilize graph\nfiltering to fuse geometric structure and attribute information. We then reduce\nthe complexity with high-quality anchors that are adaptively learned via a\nnovel similarity-preserving regularizer. We illustrate the cluster-ability of\nour proposed method theoretically and experimentally. 
In particular, we deploy\nCDC to graph data of size 111M.\n","authors":["Zhao Kang","Xuanting Xie","Bingheng Li","Erlin Pan"],"pdf_url":"https://arxiv.org/pdf/2403.03670v2.pdf","comment":"Accepted by TNNLS"},{"id":"http://arxiv.org/abs/2408.10517v5","updated":"2025-01-09T06:41:46Z","published":"2024-08-20T03:35:28Z","title":"Integrating Multi-Modal Input Token Mixer Into Mamba-Based Decision\n Models: Decision MetaMamba","summary":" Sequence modeling with State Space models (SSMs) has demonstrated performance\nsurpassing that of Transformers in various tasks, raising expectations for\ntheir potential to outperform the Decision Transformer and its enhanced\nvariants in offline reinforcement learning (RL). However, decision models based\non Mamba, a state-of-the-art SSM, failed to achieve superior performance\ncompared to these enhanced Decision Transformers. We hypothesize that this\nlimitation arises from information loss during the selective scanning phase. To\naddress this, we propose the Decision MetaMamba (DMM), which augments Mamba\nwith a token mixer in its input layer. This mixer explicitly accounts for the\nmultimodal nature of offline RL inputs, comprising state, action, and\nreturn-to-go. The DMM demonstrates improved performance while significantly\nreducing parameter count compared to prior models. Notably, similar performance\ngains were achieved using a simple linear token mixer, emphasizing the\nimportance of preserving information from proximate time steps rather than the\nspecific design of the token mixer itself. This novel modification to Mamba's\ninput layer represents a departure from conventional timestamp-based encoding\napproaches used in Transformers. 
By enhancing performance of Mamba in offline\nRL, characterized by memory efficiency and fast inference, this work opens new\navenues for its broader application in future RL research.\n","authors":["Wall Kim"],"pdf_url":"https://arxiv.org/pdf/2408.10517v5.pdf","comment":"We have decided to withdraw this manuscript as we believe that the\n work requires significant improvements and further research to ensure its\n quality and impact. We are currently pursuing a more comprehensive approach\n to address the limitations of the current submission and plan to resubmit an\n improved version in the future"},{"id":"http://arxiv.org/abs/2408.16030v2","updated":"2025-01-09T06:33:24Z","published":"2024-08-28T09:30:20Z","title":"Deep Learning-Based Automatic Multi-Level Airway Collapse Monitoring on\n Obstructive Sleep Apnea Patients","summary":" This study investigated the use of deep learning to identify multi-level\nupper airway collapses in obstructive sleep apnea (OSA) patients based on\nsnoring sounds. We fine-tuned ResNet-50 and Audio Spectrogram Transformer\n(AST) models using snoring recordings from 37 subjects undergoing drug-induced\nsleep endoscopy (DISE) between 2020 and 2021. Snoring sounds were labeled\naccording to the VOTE (Velum, Oropharynx, Tongue Base, Epiglottis)\nclassification, resulting in 259 V, 403 O, 77 T, 13 E, 1016 VO, 46 VT, 140 OT,\n39 OE, 30 VOT, and 3150 non-snoring (N) 0.5-second clips. The models were\ntrained for two multi-label classification tasks: identifying obstructions at\nV, O, T, and E levels, and identifying retropalatal (RP) and retroglossal (RG)\nobstructions. Results showed AST slightly outperformed ResNet-50,\ndemonstrating good ability to identify V (F1-score: 0.71, MCC: 0.61, AUC:\n0.89), O (F1-score: 0.80, MCC: 0.72, AUC: 0.94), and RP obstructions (F1-score:\n0.86, MCC: 0.77, AUC: 0.97). However, both models struggled with T, E, and RG\nclassifications due to limited data. 
Retrospective analysis of a full-night\nrecording showed the potential to profile airway obstruction dynamics. We\nexpect this information, combined with polysomnography and other clinical\nparameters, can aid clinical triage and treatment planning for OSA patients.\n","authors":["Ying-Chieh Hsu","Stanley Yung-Chuan Liu","Chao-Jung Huang","Chi-Wei Wu","Ren-Kai Cheng","Jane Yung-Jen Hsu","Shang-Ran Huang","Yuan-Ren Cheng","Fu-Shun Hsu"],"pdf_url":"https://arxiv.org/pdf/2408.16030v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05000v1","updated":"2025-01-09T06:29:50Z","published":"2025-01-09T06:29:50Z","title":"Load Forecasting for Households and Energy Communities: Are Deep\n Learning Models Worth the Effort?","summary":" Accurate load forecasting is crucial for predictive control in many energy\ndomain applications, with significant economic and ecological implications. To\naddress these implications, this study provides an extensive benchmark of\nstate-of-the-art deep learning models for short-term load forecasting in energy\ncommunities. Namely, LSTM, xLSTM, and Transformers are compared with benchmarks\nsuch as KNNs, synthetic load models, and persistence forecasting models. This\ncomparison considers different scales of aggregation (e.g., number of household\nloads) and varying training data availability (e.g., training data time spans).\nFurther, the impact of transfer learning from synthetic (standard) load\nprofiles and the deep learning model size (i.e., parameter count) is\ninvestigated in terms of forecasting error. Implementations are publicly\navailable and other researchers are encouraged to benchmark models using this\nframework. Additionally, a comprehensive case study, comprising an energy\ncommunity of 50 households and a battery storage demonstrates the beneficial\nfinancial implications of accurate predictions. 
Key findings of this research\ninclude: (1) Simple persistence benchmarks outperform deep learning models for\nshort-term load forecasting when the available training data is limited to six\nmonths or less; (2) Pretraining with publicly available synthetic load profiles\nimproves the normalized Mean Absolute Error (nMAE) by an average of 1.28%pt\nduring the first nine months of training data; (3) Increased aggregation\nsignificantly enhances the performance of deep learning models relative to\npersistence benchmarks; (4) Improved load forecasting, with an nMAE reduction\nof 1.1%pt, translates to an economic benefit of approximately 600EUR per year\nin an energy community comprising 50 households.\n","authors":["Lukas Moosbrugger","Valentin Seiler","Philipp Wohlgenannt","Sebastian Hegenbart","Sashko Ristov","Peter Kepplinger"],"pdf_url":"https://arxiv.org/pdf/2501.05000v1.pdf","comment":"This preprint was submitted to the Elsevier journal Energy and AI on\n December 18, 2024"},{"id":"http://arxiv.org/abs/2501.04997v1","updated":"2025-01-09T06:26:28Z","published":"2025-01-09T06:26:28Z","title":"GiNet: Integrating Sequential and Context-Aware Learning for Battery\n Capacity Prediction","summary":" The surging demand for batteries requires advanced battery management\nsystems, where battery capacity modelling is a key functionality. In this\npaper, we aim to achieve accurate battery capacity prediction by learning from\nhistorical measurements of battery dynamics. We propose GiNet, a gated\nrecurrent units enhanced Informer network, for predicting battery's capacity.\nThe novelty and competitiveness of GiNet lies in its capability of capturing\nsequential and contextual information from raw battery data and reflecting the\nbattery's complex behaviors with both temporal dynamics and long-term\ndependencies. 
We conducted an experimental study based on a publicly available\ndataset to showcase GiNet's strength of gaining a holistic understanding of\nbattery behavior and predicting battery capacity accurately. GiNet achieves\n0.11 mean absolute error for predicting the battery capacity in a sequence of\nfuture time slots without knowing the historical battery capacity. It also\noutperforms the latest algorithms significantly with 27% error reduction on\naverage compared to Informer. The promising results highlight the importance of\ncustomized and optimized integration of algorithm and battery knowledge and\nshed light on other industry applications as well.\n","authors":["Sara Sameer","Wei Zhang","Xin Lou","Qingyu Yan","Terence Goh","Yulin Gao"],"pdf_url":"https://arxiv.org/pdf/2501.04997v1.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2412.05144v2","updated":"2025-01-09T06:18:12Z","published":"2024-12-06T16:00:50Z","title":"Effective Rank and the Staircase Phenomenon: New Insights into Neural\n Network Training Dynamics","summary":" In recent years, deep learning, powered by neural networks, has achieved\nwidespread success in solving high-dimensional problems, particularly those\nwith low-dimensional feature structures. This success stems from their ability\nto identify and learn low dimensional features tailored to the problems.\nUnderstanding how neural networks extract such features during training\ndynamics remains a fundamental question in deep learning theory. In this work,\nwe propose a novel perspective by interpreting the neurons in the last hidden\nlayer of a neural network as basis functions that represent essential features.\nTo explore the linear independence of these basis functions throughout the deep\nlearning dynamics, we introduce the concept of 'effective rank'. 
Our extensive\nnumerical experiments reveal a notable phenomenon: the effective rank increases\nprogressively during the learning process, exhibiting a staircase-like pattern,\nwhile the loss function concurrently decreases as the effective rank rises. We\nrefer to this observation as the 'staircase phenomenon'. Specifically, for deep\nneural networks, we rigorously prove the negative correlation between the loss\nfunction and effective rank, demonstrating that the lower bound of the loss\nfunction decreases with increasing effective rank. Therefore, to achieve a\nrapid descent of the loss function, it is critical to promote the swift growth\nof effective rank. Ultimately, we evaluate existing advanced learning\nmethodologies and find that these approaches can quickly achieve a higher\neffective rank, thereby avoiding redundant staircase processes and accelerating\nthe rapid decline of the loss function.\n","authors":["Jiang Yang","Yuxiang Zhao","Quanhui Zhu"],"pdf_url":"https://arxiv.org/pdf/2412.05144v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.03389v3","updated":"2025-01-09T06:11:32Z","published":"2023-11-04T04:54:17Z","title":"Learning Disentangled Speech Representations","summary":" Disentangled representation learning in speech processing has lagged behind\nother domains, largely due to the lack of datasets with annotated generative\nfactors for robust evaluation. To address this, we propose SynSpeech, a novel\nlarge-scale synthetic speech dataset specifically designed to enable research\non disentangled speech representations. 
SynSpeech includes controlled\nvariations in speaker identity, spoken text, and speaking style, with three\ndataset versions to support experimentation at different levels of complexity.\n In this study, we present a comprehensive framework to evaluate disentangled\nrepresentation learning techniques, applying both linear probing and\nestablished supervised disentanglement metrics to assess the modularity,\ncompactness, and informativeness of the representations learned by a\nstate-of-the-art model. Using the RAVE model as a test case, we find that\nSynSpeech facilitates benchmarking across a range of factors, achieving\npromising disentanglement of simpler features like gender and speaking style,\nwhile highlighting challenges in isolating complex attributes like speaker\nidentity. This benchmark dataset and evaluation framework fills a critical gap,\nsupporting the development of more robust and interpretable speech\nrepresentation learning methods.\n","authors":["Yusuf Brima","Ulf Krumnack","Simone Pika","Gunther Heidemann"],"pdf_url":"https://arxiv.org/pdf/2311.03389v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04982v1","updated":"2025-01-09T05:45:03Z","published":"2025-01-09T05:45:03Z","title":"CuRLA: Curriculum Learning Based Deep Reinforcement Learning for\n Autonomous Driving","summary":" In autonomous driving, traditional Computer Vision (CV) agents often struggle\nin unfamiliar situations due to biases in the training data. Deep Reinforcement\nLearning (DRL) agents address this by learning from experience and maximizing\nrewards, which helps them adapt to dynamic environments. However, ensuring\ntheir generalization remains challenging, especially with static training\nenvironments. Additionally, DRL models lack transparency, making it difficult\nto guarantee safety in all scenarios, particularly those not seen during\ntraining. To tackle these issues, we propose a method that combines DRL with\nCurriculum Learning for autonomous driving. 
Our approach uses a Proximal Policy\nOptimization (PPO) agent and a Variational Autoencoder (VAE) to learn safe\ndriving in the CARLA simulator. The agent is trained using two-fold curriculum\nlearning, progressively increasing environment difficulty and incorporating a\ncollision penalty in the reward function to promote safety. This method\nimproves the agent's adaptability and reliability in complex environments, and\nhelps it understand the nuances of balancing multiple reward components from\ndifferent feedback signals in a single scalar reward function. Keywords:\nComputer Vision, Deep Reinforcement Learning, Variational Autoencoder, Proximal\nPolicy Optimization, Curriculum Learning, Autonomous Driving.\n","authors":["Bhargava Uppuluri","Anjel Patel","Neil Mehta","Sridhar Kamath","Pratyush Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2501.04982v1.pdf","comment":"To be published in the 17th International Conference on Agents and\n Artificial Intelligence (ICAART), Feb 2025"},{"id":"http://arxiv.org/abs/2402.08948v3","updated":"2025-01-09T05:24:57Z","published":"2024-02-14T05:34:24Z","title":"Mean-Field Analysis for Learning Subspace-Sparse Polynomials with\n Gaussian Input","summary":" In this work, we study the mean-field flow for learning subspace-sparse\npolynomials using stochastic gradient descent and two-layer neural networks,\nwhere the input distribution is standard Gaussian and the output only depends\non the projection of the input onto a low-dimensional subspace. We establish a\nnecessary condition for SGD-learnability, involving both the characteristics of\nthe target function and the expressiveness of the activation function. 
In\naddition, we prove that the condition is almost sufficient, in the sense that a\ncondition slightly stronger than the necessary condition can guarantee the\nexponential decay of the loss functional to zero.\n","authors":["Ziang Chen","Rong Ge"],"pdf_url":"https://arxiv.org/pdf/2402.08948v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.01145v5","updated":"2025-01-09T05:14:56Z","published":"2024-01-02T10:55:01Z","title":"HAAQI-Net: A Non-intrusive Neural Music Audio Quality Assessment Model\n for Hearing Aids","summary":" This paper introduces HAAQI-Net, a non-intrusive deep learning-based music\naudio quality assessment model for hearing aid users. Unlike traditional\nmethods like the Hearing Aid Audio Quality Index (HAAQI) that require intrusive\nreference signal comparisons, HAAQI-Net offers a more accessible and\ncomputationally efficient alternative. By utilizing a Bidirectional Long\nShort-Term Memory (BLSTM) architecture with attention mechanisms and features\nextracted from the pre-trained BEATs model, it can predict HAAQI scores\ndirectly from music audio clips and hearing loss patterns. Experimental results\ndemonstrate HAAQI-Net's effectiveness, achieving a Linear Correlation\nCoefficient (LCC) of 0.9368, a Spearman's Rank Correlation Coefficient (SRCC)\nof 0.9486, and a Mean Squared Error (MSE) of 0.0064, while inference time is\nsignificantly reduced from 62.52 to 2.54 seconds. To address computational\noverhead, a knowledge distillation strategy was applied, reducing parameters by\n75.85% and inference time by 96.46%, while maintaining strong performance (LCC:\n0.9071, SRCC: 0.9307, MSE: 0.0091). To expand its capabilities, HAAQI-Net\nwas adapted to predict subjective human scores like the Mean Opinion Score\n(MOS) through fine-tuning. This adaptation significantly improved prediction\naccuracy, validated through statistical analysis. 
Furthermore, the robustness\nof HAAQI-Net was evaluated under varying Sound Pressure Level (SPL) conditions,\nrevealing optimal performance at a reference SPL of 65 dB, with accuracy\ngradually decreasing as SPL deviated from this point. The advancements in\nsubjective score prediction, SPL robustness, and computational efficiency\nposition HAAQI-Net as a scalable solution for music audio quality assessment in\nhearing aid applications, contributing to efficient and accurate models in\naudio signal processing and hearing aid technology.\n","authors":["Dyah A. M. G. Wisnu","Stefano Rini","Ryandhimas E. Zezario","Hsin-Min Wang","Yu Tsao"],"pdf_url":"https://arxiv.org/pdf/2401.01145v5.pdf","comment":"Accepted by IEEE/ACM Transactions on Audio, Speech, and Language\n Processing (TASLP), 2025"},{"id":"http://arxiv.org/abs/2501.04971v1","updated":"2025-01-09T05:02:50Z","published":"2025-01-09T05:02:50Z","title":"Self-Adaptive Ising Machines for Constrained Optimization","summary":" Ising machines (IM) are physics-inspired alternatives to von Neumann\narchitectures for solving hard optimization tasks. By mapping binary variables\nto coupled Ising spins, IMs can naturally solve unconstrained combinatorial\noptimization problems such as finding maximum cuts in graphs. However, despite\ntheir importance in practical applications, constrained problems remain\nchallenging to solve for IMs that require large quadratic energy penalties to\nensure the correspondence between energy ground states and constrained optimal\nsolutions. To relax this requirement, we propose a self-adaptive IM that\niteratively shapes its energy landscape using a Lagrange relaxation of\nconstraints and avoids prior tuning of penalties. Using a probabilistic-bit\n(p-bit) IM emulated in software, we benchmark our algorithm with\nmultidimensional knapsack problems (MKP) and quadratic knapsack problems (QKP),\nthe latter being an Ising problem with linear constraints. 
For QKP with 300\nvariables, the proposed algorithm finds better solutions than state-of-the-art\nIMs such as Fujitsu's Digital Annealer and requires 7,500x fewer samples. Our\nresults show that adapting the energy landscape during the search can speed up\nIMs for constrained optimization.\n","authors":["Corentin Delacour"],"pdf_url":"https://arxiv.org/pdf/2501.04971v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04970v1","updated":"2025-01-09T04:59:15Z","published":"2025-01-09T04:59:15Z","title":"Battling the Non-stationarity in Time Series Forecasting via Test-time\n Adaptation","summary":" Deep Neural Networks have spearheaded remarkable advancements in time series\nforecasting (TSF), one of the major tasks in time series modeling. Nonetheless,\nthe non-stationarity of time series undermines the reliability of pre-trained\nsource time series forecasters in mission-critical deployment settings. In this\nstudy, we introduce a pioneering test-time adaptation framework tailored for\nTSF (TSF-TTA). TAFAS, the proposed approach to TSF-TTA, flexibly adapts source\nforecasters to continuously shifting test distributions while preserving the\ncore semantic information learned during pre-training. The novel utilization of\npartially-observed ground truth and gated calibration module enables proactive,\nrobust, and model-agnostic adaptation of source forecasters. Experiments on\ndiverse benchmark datasets and cutting-edge architectures demonstrate the\nefficacy and generality of TAFAS, especially in long-term forecasting scenarios\nthat suffer from significant distribution shifts. 
The code is available at\nhttps://github.com/kimanki/TAFAS.\n","authors":["HyunGi Kim","Siwon Kim","Jisoo Mok","Sungroh Yoon"],"pdf_url":"https://arxiv.org/pdf/2501.04970v1.pdf","comment":"Accepted at AAAI 2025"},{"id":"http://arxiv.org/abs/2501.04967v1","updated":"2025-01-09T04:41:50Z","published":"2025-01-09T04:41:50Z","title":"Targeted Adversarial Denoising Autoencoders (TADA) for Neural Time\n Series Filtration","summary":" Current machine learning (ML)-based algorithms for filtering\nelectroencephalography (EEG) time series data face challenges related to\ncumbersome training times, regularization, and accurate reconstruction. To\naddress these shortcomings, we present an ML filtration algorithm driven by a\nlogistic covariance-targeted adversarial denoising autoencoder (TADA). We\nhypothesize that the expressivity of a targeted, correlation-driven\nconvolutional autoencoder will enable effective time series filtration while\nminimizing compute requirements (e.g., runtime, model size). Furthermore, we\nexpect that adversarial training with covariance rescaling will minimize signal\ndegradation. To test this hypothesis, a TADA system prototype was trained and\nevaluated on the task of removing electromyographic (EMG) noise from EEG data\nin the EEGdenoiseNet dataset, which includes EMG and EEG data from 67 subjects.\nThe TADA filter surpasses conventional signal filtration algorithms across\nquantitative metrics (Correlation Coefficient, Temporal RRMSE, Spectral RRMSE),\nand performs competitively against other deep learning architectures at a\nreduced model size of less than 400,000 trainable parameters. Further\nexperimentation will be necessary to assess the viability of TADA on a wider\nrange of deployment cases.\n","authors":["Benjamin J. Choi","Griffin Milsap","Clara A. 
Scholl","Francesco Tenore","Mattson Ogg"],"pdf_url":"https://arxiv.org/pdf/2501.04967v1.pdf","comment":"[Accepted] Artificial Intelligence for Time Series Analysis (AI4TS):\n Theory, Algorithms, and Applications @ AAAI 2025, Philadelphia, PA, USA"},{"id":"http://arxiv.org/abs/2501.04276v2","updated":"2025-01-09T04:26:27Z","published":"2025-01-08T04:54:28Z","title":"Bridging Adaptivity and Safety: Learning Agile Collision-Free Locomotion\n Across Varied Physics","summary":" Real-world legged locomotion systems often need to reconcile agility and\nsafety for different scenarios. Moreover, the underlying dynamics are often\nunknown and time-variant (e.g., payload, friction). In this paper, we introduce\nBAS (Bridging Adaptivity and Safety), which builds upon the pipeline of prior\nwork Agile But Safe (ABS)(He et al.) and is designed to provide adaptive safety\neven in dynamic environments with uncertainties. BAS involves an agile policy\nto avoid obstacles rapidly and a recovery policy to prevent collisions, a\nphysical parameter estimator that is concurrently trained with the agile\npolicy, and a learned control-theoretic RA (reach-avoid) value network that\ngoverns the policy switch. Also, the agile policy and RA network are both\nconditioned on physical parameters to make them adaptive. To mitigate the\ndistribution shift issue, we further introduce an on-policy fine-tuning phase\nfor the estimator to enhance its robustness and accuracy. The simulation\nresults show that BAS achieves 50% better safety than baselines in dynamic\nenvironments while maintaining a higher speed on average. In real-world\nexperiments, BAS shows its capability in complex environments with unknown\nphysics (e.g., slippery floors with unknown frictions, unknown payloads up to\n8kg), while baselines lack adaptivity, leading to collisions or degraded\nagility. As a result, BAS achieves a 19.8% increase in speed and a 2.36 times\nlower collision rate than ABS in the real world. 
Videos: https://adaptive-safe-locomotion.github.io.\n","authors":["Yichao Zhong","Chong Zhang","Tairan He","Guanya Shi"],"pdf_url":"https://arxiv.org/pdf/2501.04276v2.pdf","comment":"11 Pages, 6 Figures"},{"id":"http://arxiv.org/abs/2501.04961v1","updated":"2025-01-09T04:26:15Z","published":"2025-01-09T04:26:15Z","title":"Demystifying Domain-adaptive Post-training for Financial LLMs","summary":" Domain-adaptive post-training of large language models (LLMs) has emerged as\na promising approach for specialized domains such as medicine and finance.\nHowever, significant challenges remain in identifying optimal adaptation\ncriteria and training strategies across varying data and model configurations.\nTo address these challenges, we introduce FINDAP, a systematic and fine-grained\ninvestigation into domain-adaptive post-training of LLMs for the finance\ndomain. Our approach begins by identifying the core capabilities required for\nthe target domain and designing a comprehensive evaluation suite aligned with\nthese needs. We then analyze the effectiveness of key post-training stages,\nincluding continual pretraining, instruction tuning, and preference alignment.\nBuilding on these insights, we propose an effective training recipe centered on\na novel preference data distillation method, which leverages process signals\nfrom a generative reward model. The resulting model, Llama-Fin, achieves\nstate-of-the-art performance across a wide range of financial tasks. Our\nanalysis also highlights how each post-training stage contributes to distinct\ncapabilities, uncovering specific challenges and effective solutions, providing\nvaluable insights for domain adaptation of LLMs. 
Project page:\nhttps://github.com/SalesforceAIResearch/FinDap\n","authors":["Zixuan Ke","Yifei Ming","Xuan-Phi Nguyen","Caiming Xiong","Shafiq Joty"],"pdf_url":"https://arxiv.org/pdf/2501.04961v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.07074v2","updated":"2025-01-09T04:25:14Z","published":"2024-10-09T17:19:12Z","title":"Let's Ask GNN: Empowering Large Language Model for Graph In-Context\n Learning","summary":" Textual Attributed Graphs (TAGs) are crucial for modeling complex real-world\nsystems, yet leveraging large language models (LLMs) for TAGs presents unique\nchallenges due to the gap between sequential text processing and\ngraph-structured data. We introduce AskGNN, a novel approach that bridges this\ngap by leveraging In-Context Learning (ICL) to integrate graph data and\ntask-specific information into LLMs. AskGNN employs a Graph Neural Network\n(GNN)-powered structure-enhanced retriever to select labeled nodes across\ngraphs, incorporating complex graph structures and their supervision signals.\nOur learning-to-retrieve algorithm optimizes the retriever to select example\nnodes that maximize LLM performance on graphs. Experiments across three tasks\nand seven LLMs demonstrate AskGNN's superior effectiveness in graph task\nperformance, opening new avenues for applying LLMs to graph-structured data\nwithout extensive fine-tuning.\n","authors":["Zhengyu Hu","Yichuan Li","Zhengyu Chen","Jingang Wang","Han Liu","Kyumin Lee","Kaize Ding"],"pdf_url":"https://arxiv.org/pdf/2410.07074v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.09346v2","updated":"2025-01-09T04:20:34Z","published":"2023-11-15T20:09:29Z","title":"Nothing Stands Still: A Spatiotemporal Benchmark on 3D Point Cloud\n Registration Under Large Geometric and Temporal Change","summary":" Building 3D geometric maps of man-made spaces is a well-established and\nactive field that is fundamental to computer vision and robotics. 
However,\nconsidering the evolving nature of built environments, it is essential to\nquestion the capabilities of current mapping efforts in handling temporal\nchanges. In addition, spatiotemporal mapping holds significant potential for\nachieving sustainability and circularity goals. Existing mapping approaches\nfocus on small changes, such as object relocation or self-driving car\noperation; in all cases where the main structure of the scene remains fixed.\nConsequently, these approaches fail to address more radical changes in the\nstructure of the built environment, such as geometry and topology. To this end,\nwe introduce the Nothing Stands Still (NSS) benchmark, which focuses on the\nspatiotemporal registration of 3D scenes undergoing large spatial and temporal\nchange, ultimately creating one coherent spatiotemporal map. Specifically, the\nbenchmark involves registering two or more partial 3D point clouds (fragments)\nfrom the same scene but captured from different spatiotemporal views. In\naddition to the standard pairwise registration, we assess the multi-way\nregistration of multiple fragments that belong to any temporal stage. As part\nof NSS, we introduce a dataset of 3D point clouds recurrently captured in\nlarge-scale building indoor environments that are under construction or\nrenovation. The NSS benchmark presents three scenarios of increasing\ndifficulty, to quantify the generalization ability of point cloud registration\nmethods over space (within one building and across buildings) and time. We\nconduct extensive evaluations of state-of-the-art methods on NSS. The results\ndemonstrate the necessity for novel methods specifically designed to handle\nlarge spatiotemporal changes. 
The homepage of our benchmark is at\nhttp://nothing-stands-still.com.\n","authors":["Tao Sun","Yan Hao","Shengyu Huang","Silvio Savarese","Konrad Schindler","Marc Pollefeys","Iro Armeni"],"pdf_url":"https://arxiv.org/pdf/2311.09346v2.pdf","comment":"To appear in the ISPRS Journal of Photogrammetry and Remote Sensing.\n 29 pages, 26 figures. For the project page, see\n http://nothing-stands-still.com"},{"id":"http://arxiv.org/abs/2501.04952v1","updated":"2025-01-09T03:59:10Z","published":"2025-01-09T03:59:10Z","title":"Open Problems in Machine Unlearning for AI Safety","summary":" As AI systems become more capable, widely deployed, and increasingly\nautonomous in critical areas such as cybersecurity, biological research, and\nhealthcare, ensuring their safety and alignment with human values is paramount.\nMachine unlearning -- the ability to selectively forget or suppress specific\ntypes of knowledge -- has shown promise for privacy and data removal tasks,\nwhich has been the primary focus of existing research. More recently, its\npotential application to AI safety has gained attention. In this paper, we\nidentify key limitations that prevent unlearning from serving as a\ncomprehensive solution for AI safety, particularly in managing dual-use\nknowledge in sensitive domains like cybersecurity and chemical, biological,\nradiological, and nuclear (CBRN) safety. In these contexts, information can be\nboth beneficial and harmful, and models may combine seemingly harmless\ninformation for harmful purposes -- unlearning this information could strongly\naffect beneficial uses. We provide an overview of inherent constraints and open\nproblems, including the broader side effects of unlearning dangerous knowledge,\nas well as previously unexplored tensions between unlearning and existing\nsafety mechanisms. Finally, we investigate challenges related to evaluation,\nrobustness, and the preservation of safety features during unlearning. 
By\nmapping these limitations and open challenges, we aim to guide future research\ntoward realistic applications of unlearning within a broader AI safety\nframework, acknowledging its limitations and highlighting areas where\nalternative approaches may be required.\n","authors":["Fazl Barez","Tingchen Fu","Ameya Prabhu","Stephen Casper","Amartya Sanyal","Adel Bibi","Aidan O'Gara","Robert Kirk","Ben Bucknall","Tim Fist","Luke Ong","Philip Torr","Kwok-Yan Lam","Robert Trager","David Krueger","Sören Mindermann","José Hernandez-Orallo","Mor Geva","Yarin Gal"],"pdf_url":"https://arxiv.org/pdf/2501.04952v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.18696v2","updated":"2025-01-09T03:39:37Z","published":"2024-12-24T22:55:35Z","title":"STITCH: Surface reconstrucTion using Implicit neural representations\n with Topology Constraints and persistent Homology","summary":" We present STITCH, a novel approach for neural implicit surface\nreconstruction of a sparse and irregularly spaced point cloud while enforcing\ntopological constraints (such as having a single connected component). We\ndevelop a new differentiable framework based on persistent homology to\nformulate topological loss terms that enforce the prior of a single 2-manifold\nobject. Our method demonstrates excellent performance in preserving the\ntopology of complex 3D geometries, evident through both visual and empirical\ncomparisons. We supplement this with a theoretical analysis, and provably show\nthat optimizing the loss with stochastic (sub)gradient descent leads to\nconvergence and enables reconstructing shapes with a single connected\ncomponent. 
Our approach showcases the integration of differentiable topological\ndata analysis tools for implicit surface reconstruction.\n","authors":["Anushrut Jignasu","Ethan Herron","Zhanhong Jiang","Soumik Sarkar","Chinmay Hegde","Baskar Ganapathysubramanian","Aditya Balu","Adarsh Krishnamurthy"],"pdf_url":"https://arxiv.org/pdf/2412.18696v2.pdf","comment":"19 pages, 12 figures, 29 tables"},{"id":"http://arxiv.org/abs/2501.04946v1","updated":"2025-01-09T03:36:17Z","published":"2025-01-09T03:36:17Z","title":"Non-asymptotic analysis of the performance of the penalized least\n trimmed squares in sparse models","summary":" The least trimmed squares (LTS) estimator is a renowned robust alternative to\nthe classic least squares estimator and is popular in location, regression,\nmachine learning, and AI literature. Many studies exist on LTS, including its\nrobustness, computation algorithms, extension to non-linear cases, asymptotics,\netc. The LTS has been applied in the penalized regression in a high-dimensional\nreal-data sparse-model setting where dimension $p$ (in thousands) is much\nlarger than sample size $n$ (in tens, or hundreds). In such a practical\nsetting, the sample size $n$ often is the count of sub-population that has a\nspecial attribute (e.g. the count of patients of Alzheimer's, Parkinson's,\nLeukemia, or ALS, etc.) among a population with a finite fixed size N.\nAsymptotic analysis assuming that $n$ tends to infinity is not practically\nconvincing and legitimate in such a scenario. 
A non-asymptotic or finite sample\nanalysis will be more desirable and feasible.\n This article establishes some finite sample (non-asymptotic) error bounds for\nestimating and predicting based on LTS with high probability for the first\ntime.\n","authors":["Yijun Zuo"],"pdf_url":"https://arxiv.org/pdf/2501.04946v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.18729v2","updated":"2025-01-09T03:34:55Z","published":"2024-11-27T20:08:55Z","title":"Multi-Task Model Merging via Adaptive Weight Disentanglement","summary":" Model merging has recently gained attention as an economical and scalable\napproach to incorporate task-specific weights from various tasks into a unified\nmulti-task model. For example, in Task Arithmetic (TA), adding the fine-tuned\nweights of different tasks can enhance the model's performance on those tasks,\nwhile subtracting them leads to task forgetting. Although TA is highly\neffective, interference among tasks still hampers the performance of the merged\nmodel. Existing methods for handling conflicts between tasks generally rely on\nempirical selection, resulting in suboptimal performance. In this paper, we\nintroduce an Adaptive Weight Disentanglement method. We begin by theoretically\nproving that task vectors employed in model merging should be orthogonal to\nminimize interference among tasks. Guided by this insight, we initialize\nredundant vectors such that, when subtracted from the original task vectors,\nthe resulting vectors exhibit increased orthogonality. Additionally, we impose\na norm constraint on the redundant vectors to preserve the performance of the\ntask-specific models. Experimental results demonstrate the effectiveness of our\nproposed technique: it successfully extracts redundant vectors, and after their\nsubtraction, the task vectors not only retain robust performance but also\nachieve superior fusion outcomes. 
Our code is available at\n\\href{https://github.com/FarisXiong/AWD.git}{https://github.com/FarisXiong/AWD.git}.\n","authors":["Feng Xiong","Runxi Cheng","Wang Chen","Zhanqiu Zhang","Yiwen Guo","Chun Yuan","Ruifeng Xu"],"pdf_url":"https://arxiv.org/pdf/2411.18729v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04940v1","updated":"2025-01-09T03:14:03Z","published":"2025-01-09T03:14:03Z","title":"A New Perspective on Privacy Protection in Federated Learning with\n Granular-Ball Computing","summary":" Federated Learning (FL) facilitates collaborative model training while\nprioritizing privacy by avoiding direct data sharing. However, most existing\narticles attempt to address challenges within the model's internal parameters\nand corresponding outputs, while neglecting to solve them at the input level.\nTo address this gap, we propose a novel framework called Granular-Ball\nFederated Learning (GrBFL) for image classification. GrBFL diverges from\ntraditional methods that rely on the finest-grained input data. Instead, it\nsegments images into multiple regions with optimal coarse granularity, which\nare then reconstructed into a graph structure. We designed a two-dimensional\nbinary search segmentation algorithm based on variance constraints for GrBFL,\nwhich effectively removes redundant information while preserving key\nrepresentative features. Extensive theoretical analysis and experiments\ndemonstrate that GrBFL not only safeguards privacy and enhances efficiency but\nalso maintains robust utility, consistently outperforming other\nstate-of-the-art FL methods. 
The code is available at\nhttps://github.com/AIGNLAI/GrBFL.\n","authors":["Guannan Lai","Yihui Feng","Xin Yang","Xiaoyu Deng","Hao Yu","Shuyin Xia","Guoyin Wang","Tianrui Li"],"pdf_url":"https://arxiv.org/pdf/2501.04940v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.01100v2","updated":"2025-01-09T03:12:38Z","published":"2025-01-02T06:49:58Z","title":"Long-range Brain Graph Transformer","summary":" Understanding communication and information processing among brain regions of\ninterest (ROIs) is highly dependent on long-range connectivity, which plays a\ncrucial role in facilitating diverse functional neural integration across the\nentire brain. However, previous studies generally focused on the short-range\ndependencies within brain networks while neglecting the long-range\ndependencies, limiting an integrated understanding of brain-wide communication.\nTo address this limitation, we propose Adaptive Long-range aware TransformER\n(ALTER), a brain graph transformer to capture long-range dependencies between\nbrain ROIs utilizing biased random walk. Specifically, we present a novel\nlong-range aware strategy to explicitly capture long-range dependencies between\nbrain ROIs. By guiding the walker towards the next hop with higher correlation\nvalue, our strategy simulates the real-world brain-wide communication.\nFurthermore, by employing the transformer framework, ALTER adaptively\nintegrates both short- and long-range dependencies between brain ROIs, enabling\nan integrated understanding of multi-level communication across the entire\nbrain. Extensive experiments on ABIDE and ADNI datasets demonstrate that ALTER\nconsistently outperforms generalized state-of-the-art graph learning methods\n(including SAN, Graphormer, GraphTrans, and LRGNN) and other graph learning\nbased brain network analysis methods (including FBNETGEN, BrainNetGNN,\nBrainGNN, and BrainNETTF) in neurological disease diagnosis. 
Cases of\nlong-range dependencies are also presented to further illustrate the\neffectiveness of ALTER. The implementation is available at\nhttps://github.com/yushuowiki/ALTER.\n","authors":["Shuo Yu","Shan Jin","Ming Li","Tabinda Sarwar","Feng Xia"],"pdf_url":"https://arxiv.org/pdf/2501.01100v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.11691v5","updated":"2025-01-09T02:35:18Z","published":"2022-09-23T16:11:09Z","title":"Linear Multidimensional Regression with Interactive Fixed-Effects","summary":" This paper studies a linear and additively separable regression model for\nmultidimensional panel data of three or more dimensions with unobserved\ninteractive fixed effects. The main estimator follows a double debias approach,\nand requires two preliminary steps to control unobserved heterogeneity. First,\nthe model is embedded within the standard two-dimensional panel framework and\nrestrictions are formed under which the factor structure methods in Bai (2009)\nlead to consistent estimation of model parameters, but at slow rates of\nconvergence. The second step develops a weighted fixed-effects method that is\nrobust to the multidimensional nature of the problem and achieves the\nparametric rate of consistency. This second step is combined with a double\ndebias procedure for asymptotically normal slope estimates. The methods are\nimplemented to estimate the demand elasticity for beer.\n","authors":["Hugo Freeman"],"pdf_url":"https://arxiv.org/pdf/2209.11691v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04323v2","updated":"2025-01-09T02:33:04Z","published":"2025-01-08T07:47:43Z","title":"Navigating the Designs of Privacy-Preserving Fine-tuning for Large\n Language Models","summary":" Instruction tuning has proven effective in enhancing Large Language Models'\n(LLMs) performance on downstream tasks. 
However, real-world fine-tuning faces\ninherent conflicts between model providers' intellectual property protection,\nclients' data privacy requirements, and tuning costs. While recent approaches\nlike split learning and offsite tuning demonstrate promising architectures for\nprivacy-preserving fine-tuning, there is a gap in systematically addressing the\nmultidimensional trade-offs required for diverse real-world deployments. We\npropose several indicative evaluation metrics to guide design trade-offs for\nprivacy-preserving fine-tuning and a series of example designs, collectively\nnamed GuardedTuning; they result from novel combinations of system\narchitectures with adapted privacy-enhancement methods and emerging computation\ntechniques. Each design represents distinct trade-offs across model utility,\nprivacy guarantees, and costs. Experimental results demonstrate that these\ndesigns protect against data reconstruction attacks while maintaining\ncompetitive fine-tuning performance.\n","authors":["Haonan Shi","Tu Ouyang","An Wang"],"pdf_url":"https://arxiv.org/pdf/2501.04323v2.pdf","comment":"4 pages, 2 figures"},{"id":"http://arxiv.org/abs/2501.04126v2","updated":"2025-01-09T02:20:28Z","published":"2025-01-07T20:12:56Z","title":"Stochastic Process Learning via Operator Flow Matching","summary":" Expanding on neural operators, we propose a novel framework for stochastic\nprocess learning across arbitrary domains. In particular, we develop operator\nflow matching (OFM) for learning stochastic process priors on function spaces.\nOFM provides the probability density of the values of any collection of points\nand enables mathematically tractable functional regression at new points with\nmean and density estimation. Our method outperforms state-of-the-art models in\nstochastic process learning, functional regression, and prior learning.\n","authors":["Yaozhong Shi","Zachary E. 
Ross","Domniki Asimaki","Kamyar Azizzadenesheli"],"pdf_url":"https://arxiv.org/pdf/2501.04126v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04070v2","updated":"2025-01-09T02:20:13Z","published":"2025-01-07T14:57:08Z","title":"More is not always better? Enhancing Many-Shot In-Context Learning with\n Differentiated and Reweighting Objectives","summary":" Large language models (LLMs) excel at few-shot in-context learning (ICL)\nwithout requiring parameter updates. However, as the number of ICL\ndemonstrations increases from a few to many, performance tends to plateau and\neventually decline. We identify two primary causes for this trend: the\nsuboptimal negative log-likelihood (NLL) optimization objective and the\nincremental data noise. To address these issues, we introduce DrICL, a novel\noptimization method that enhances model performance through Differentiated\nLearning and advantage-based Reweighting objectives. Globally, DrICL utilizes\ndifferentiated learning to optimize the NLL objective, ensuring that many-shot\nperformance surpasses zero-shot levels. Locally, it dynamically adjusts the\nweighting of many-shot demonstrations by leveraging cumulative advantages\ninspired by reinforcement learning, thereby improving generalization. This\napproach allows the model to handle varying numbers of shots effectively,\nmitigating the impact of noisy data. Recognizing the lack of multi-task\ndatasets with diverse many-shot distributions, we develop the Many-Shot ICL\nBenchmark (ICL-50)-a large-scale benchmark of 50 tasks that cover shot numbers\nfrom 1 to 350 within sequences of up to 8,000 tokens-for fine-tuning purposes.\nICL-50 facilitates the evaluation of many-shot ICL strategies across seven\nprominent NLP tasks and 50 distinct datasets. Experimental results demonstrate\nthat LLMs enhanced with DrICL achieve significant improvements in many-shot\nsetups across various tasks, including both in-domain and out-of-domain\nscenarios. 
We release the code and benchmark dataset hoping to facilitate\nfurther research in many-shot ICL.\n","authors":["Xiaoqing Zhang","Ang Lv","Yuhan Liu","Flood Sung","Wei Liu","Shuo Shang","Xiuying Chen","Rui Yan"],"pdf_url":"https://arxiv.org/pdf/2501.04070v2.pdf","comment":"13 pages, 8 figures, 11 tables"},{"id":"http://arxiv.org/abs/2501.04916v1","updated":"2025-01-09T02:14:12Z","published":"2025-01-09T02:14:12Z","title":"SpecTf: Transformers Enable Data-Driven Imaging Spectroscopy Cloud\n Detection","summary":" Current and upcoming generations of visible-shortwave infrared (VSWIR)\nimaging spectrometers promise unprecedented capacity to quantify Earth System\nprocesses across the globe. However, reliable cloud screening remains a\nfundamental challenge for these instruments, where traditional spatial and\ntemporal approaches are limited by cloud variability and limited temporal\ncoverage. The Spectroscopic Transformer (SpecTf) addresses these challenges\nwith a spectroscopy-specific deep learning architecture that performs cloud\ndetection using only spectral information (no spatial or temporal data are\nrequired). By treating spectral measurements as sequences rather than image\nchannels, SpecTf learns fundamental physical relationships without relying on\nspatial context. Our experiments demonstrate that SpecTf significantly\noutperforms the current baseline approach implemented for the EMIT instrument,\nand performs comparably with other machine learning methods with orders of\nmagnitude fewer learned parameters. Critically, we demonstrate SpecTf's\ninherent interpretability through its attention mechanism, revealing physically\nmeaningful spectral features the model has learned. 
Finally, we present\nSpecTf's potential for cross-instrument generalization by applying it to a\ndifferent instrument on a different platform without modifications, opening the\ndoor to instrument-agnostic, data-driven algorithms for future imaging\nspectroscopy tasks.\n","authors":["Jake H. Lee","Michael Kiper","David R. Thompson","Philip G. Brodrick"],"pdf_url":"https://arxiv.org/pdf/2501.04916v1.pdf","comment":"23 pages, 5 figures, in review. Code repository:\n https://github.com/emit-sds/SpecTf"},{"id":"http://arxiv.org/abs/2501.04914v1","updated":"2025-01-09T02:10:15Z","published":"2025-01-09T02:10:15Z","title":"From Mesh Completion to AI Designed Crown","summary":" Designing a dental crown is a time-consuming and labor-intensive process. Our\ngoal is to simplify crown design and minimize the tediousness of making manual\nadjustments while still ensuring the highest level of accuracy and consistency.\nTo this end, we present a new end-to-end deep learning approach, coined Dental\nMesh Completion (DMC), to generate a crown mesh conditioned on a point cloud\ncontext. The dental context includes the tooth prepared to receive a crown and\nits surroundings, namely the two adjacent teeth and the three closest teeth in\nthe opposing jaw. We formulate crown generation in terms of completing this\npoint cloud context. A feature extractor first converts the input point cloud\ninto a set of feature vectors that represent local regions in the point cloud.\nThe set of feature vectors is then fed into a transformer to predict a new set\nof feature vectors for the missing region (crown). Subsequently, a point\nreconstruction head, followed by a multi-layer perceptron, is used to predict a\ndense set of points with normals. Finally, a differentiable point-to-mesh layer\nserves to reconstruct the crown surface mesh. We compare our DMC method to a\ngraph-based convolutional neural network which learns to deform a crown mesh\nfrom a generic crown shape to the target geometry. 
Extensive experiments on our\ndataset demonstrate the effectiveness of our method, which attains an average\nof 0.062 Chamfer Distance. The code is available\nat: https://github.com/Golriz-code/DMC.gi\n","authors":["Golriz Hosseinimanesh","Farnoosh Ghadiri","Francois Guibault","Farida Cheriet","Julia Keren"],"pdf_url":"https://arxiv.org/pdf/2501.04914v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04284v2","updated":"2025-01-09T01:58:41Z","published":"2025-01-08T05:15:43Z","title":"ContextMRI: Enhancing Compressed Sensing MRI through Metadata\n Conditioning","summary":" Compressed sensing MRI seeks to accelerate MRI acquisition processes by\nsampling fewer k-space measurements and then reconstructing the missing data\nalgorithmically. The success of these approaches often relies on strong priors\nor learned statistical models. While recent diffusion model-based priors have\nshown great potential, previous methods typically ignore clinically available\nmetadata (e.g. patient demographics, imaging parameters, slice-specific\ninformation). In practice, metadata contains meaningful cues about the anatomy\nand acquisition protocol, suggesting it could further constrain the\nreconstruction problem. In this work, we propose ContextMRI, a text-conditioned\ndiffusion model for MRI that integrates granular metadata into the\nreconstruction process. We train a pixel-space diffusion model directly on\nminimally processed, complex-valued MRI images. During inference, metadata is\nconverted into a structured text prompt and fed to the model via CLIP text\nembeddings. By conditioning the prior on metadata, we unlock more accurate\nreconstructions and show consistent gains across multiple datasets,\nacceleration factors, and undersampling patterns. Our experiments demonstrate\nthat increasing the fidelity of metadata, ranging from slice location and\ncontrast to patient age, sex, and pathology, systematically boosts\nreconstruction performance. 
This work highlights the untapped potential of\nleveraging clinical context for inverse problems and opens a new direction for\nmetadata-driven MRI reconstruction.\n","authors":["Hyungjin Chung","Dohun Lee","Zihui Wu","Byung-Hoon Kim","Katherine L. Bouman","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2501.04284v2.pdf","comment":"29 pages, 9 figures. Code is available at\n https://github.com/DoHunLee1/ContextMRI"},{"id":"http://arxiv.org/abs/2310.05549v2","updated":"2025-01-09T01:54:34Z","published":"2023-10-09T09:17:52Z","title":"A New Transformation Approach for Uplift Modeling with Binary Outcome","summary":" Uplift modeling has been used effectively in fields such as marketing and\ncustomer retention, to target those customers who are more likely to respond\ndue to the campaign or treatment. Essentially, it is a machine learning\ntechnique that predicts the gain from performing some action with respect to\nnot taking it. A popular class of uplift models is the transformation approach\nthat redefines the target variable with the original treatment indicator. These\ntransformation approaches only need to train and predict the difference in\noutcomes directly. The main drawback of these approaches is that, in general, they\ndo not use the information in the treatment indicator beyond the construction\nof the transformed outcome and are usually not efficient. In this paper, we\ndesign a novel transformed outcome for the case of the binary target variable\nand unlock the full value of the samples with zero outcome. From a practical\nperspective, our new approach is flexible and easy to use. Experimental results\non synthetic and real-world datasets clearly show that our new approach\noutperforms the traditional one. 
At present, our new approach has already been\napplied to precision marketing in a nationwide financial holdings group in\nChina.\n","authors":["Kun Li","Liangshu Zhu"],"pdf_url":"https://arxiv.org/pdf/2310.05549v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04228v2","updated":"2025-01-09T01:35:56Z","published":"2025-01-08T01:59:47Z","title":"Constraints as Rewards: Reinforcement Learning for Robots without Reward\n Functions","summary":" Reinforcement learning has become an essential algorithm for generating\ncomplex robotic behaviors. However, to learn such behaviors, it is necessary to\ndesign a reward function that describes the task, which often consists of\nmultiple objectives that need to be balanced. This tuning process is known as\nreward engineering and typically involves extensive trial-and-error. In this\npaper, to avoid this trial-and-error process, we propose the concept of\nConstraints as Rewards (CaR). CaR formulates the task objective using multiple\nconstraint functions instead of a reward function and solves a reinforcement\nlearning problem with constraints using the Lagrangian method. By adopting this\napproach, different objectives are automatically balanced, because Lagrange\nmultipliers serve as the weights among the objectives. In addition, we\ndemonstrate that constraints, expressed as inequalities, provide an intuitive\ninterpretation of the optimization target designed for the task. 
We apply the\nproposed method to the standing-up motion generation task of a\nsix-wheeled-telescopic-legged robot and demonstrate that the proposed method\nsuccessfully acquires the target behavior, even though it is challenging to\nlearn with manually designed reward functions.\n","authors":["Yu Ishihara","Noriaki Takasugi","Kotaro Kawakami","Masaya Kinoshita","Kazumi Aoyama"],"pdf_url":"https://arxiv.org/pdf/2501.04228v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04903v1","updated":"2025-01-09T01:31:30Z","published":"2025-01-09T01:31:30Z","title":"Towards understanding the bias in decision trees","summary":" There is a widespread and longstanding belief that machine learning models\nare biased towards the majority (or negative) class when learning from\nimbalanced data, leading them to neglect or ignore the minority (or positive)\nclass. In this study, we show that this belief is not necessarily correct for\ndecision trees, and that their bias can actually be in the opposite direction.\nMotivated by a recent simulation study that suggested that decision trees can\nbe biased towards the minority class, our paper aims to reconcile the conflict\nbetween that study and decades of other works. First, we critically evaluate\npast literature on this problem, finding that failing to consider the data\ngenerating process has led to incorrect conclusions about the bias in decision\ntrees. We then prove that, under specific conditions related to the predictors,\ndecision trees fit to purity and trained on a dataset with only one positive\ncase are biased towards the minority class. Finally, we demonstrate that splits\nin a decision tree are also biased when there is more than one positive case.\nOur findings have implications for the use of popular tree-based models, such as\nrandom forests.\n","authors":["Nathan Phelps","Daniel J. Lizotte","Douglas G. 
Woolford"],"pdf_url":"https://arxiv.org/pdf/2501.04903v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04898v1","updated":"2025-01-09T01:22:22Z","published":"2025-01-09T01:22:22Z","title":"Optimality and Adaptivity of Deep Neural Features for Instrumental\n Variable Regression","summary":" We provide a convergence analysis of deep feature instrumental variable\n(DFIV) regression (Xu et al., 2021), a nonparametric approach to IV regression\nusing data-adaptive features learned by deep neural networks in two stages. We\nprove that the DFIV algorithm achieves the minimax optimal learning rate when\nthe target structural function lies in a Besov space. This is shown under\nstandard nonparametric IV assumptions, and an additional smoothness assumption\non the regularity of the conditional distribution of the covariate given the\ninstrument, which controls the difficulty of Stage 1. We further demonstrate\nthat DFIV, as a data-adaptive algorithm, is superior to fixed-feature (kernel\nor sieve) IV methods in two ways. First, when the target function possesses low\nspatial homogeneity (i.e., it has both smooth and spiky/discontinuous regions),\nDFIV still achieves the optimal rate, while fixed-feature methods are shown to\nbe strictly suboptimal. 
Second, comparing with kernel-based two-stage\nregression estimators, DFIV is provably more data efficient in the Stage 1\nsamples.\n","authors":["Juno Kim","Dimitri Meunier","Arthur Gretton","Taiji Suzuki","Zhu Li"],"pdf_url":"https://arxiv.org/pdf/2501.04898v1.pdf","comment":"46 pages, 1 figure, 2 tables"},{"id":"http://arxiv.org/abs/2305.10391v3","updated":"2025-01-09T01:09:42Z","published":"2023-05-17T17:31:20Z","title":"Optimality of Message-Passing Architectures for Sparse Graphs","summary":" We study the node classification problem on feature-decorated graphs in the\nsparse setting, i.e., when the expected degree of a node is $O(1)$ in the\nnumber of nodes, in the fixed-dimensional asymptotic regime, i.e., the\ndimension of the feature data is fixed while the number of nodes is large. Such\ngraphs are typically known to be locally tree-like. We introduce a notion of\nBayes optimality for node classification tasks, called asymptotic local Bayes\noptimality, and compute the optimal classifier according to this criterion for\na fairly general statistical data model with arbitrary distributions of the\nnode features and edge connectivity. The optimal classifier is implementable\nusing a message-passing graph neural network architecture. We then compute the\ngeneralization error of this classifier and compare its performance against\nexisting learning methods theoretically on a well-studied statistical model\nwith naturally identifiable signal-to-noise ratios (SNRs) in the data. We find\nthat the optimal message-passing architecture interpolates between a standard\nMLP in the regime of low graph signal and a typical convolution in the regime\nof high graph signal. 
Furthermore, we prove a corresponding non-asymptotic\nresult.\n","authors":["Aseem Baranwal","Kimon Fountoulakis","Aukosh Jagannath"],"pdf_url":"https://arxiv.org/pdf/2305.10391v3.pdf","comment":"27 pages, 2 figures, published at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2501.04897v1","updated":"2025-01-09T01:03:14Z","published":"2025-01-09T01:03:14Z","title":"Online Continual Learning: A Systematic Literature Review of Approaches,\n Challenges, and Benchmarks","summary":" Online Continual Learning (OCL) is a critical area in machine learning,\nfocusing on enabling models to adapt to evolving data streams in real-time\nwhile addressing challenges such as catastrophic forgetting and the\nstability-plasticity trade-off. This study conducts the first comprehensive\nSystematic Literature Review (SLR) on OCL, analyzing 81 approaches, extracting\nover 1,000 features (specific tasks addressed by these approaches), and\nidentifying more than 500 components (sub-models within approaches, including\nalgorithms and tools). We also review 83 datasets spanning applications like\nimage classification, object detection, and multimodal vision-language tasks.\nOur findings highlight key challenges, including reducing computational\noverhead, developing domain-agnostic solutions, and improving scalability in\nresource-constrained environments. Furthermore, we identify promising\ndirections for future research, such as leveraging self-supervised learning for\nmultimodal and sequential data, designing adaptive memory mechanisms that\nintegrate sparse retrieval and generative replay, and creating efficient\nframeworks for real-world applications with noisy or evolving task boundaries.\nBy providing a rigorous and structured synthesis of the current state of OCL,\nthis review offers a valuable resource for advancing this field and addressing\nits critical challenges and opportunities. 
The complete SLR methodology steps\nand extracted data are publicly available through the provided link:\nhttps://github.com/kiyan-rezaee/\nSystematic-Literature-Review-on-Online-Continual-Learning\n","authors":["Seyed Amir Bidaki","Amir Mohammadkhah","Kiyan Rezaee","Faeze Hassani","Sadegh Eskandari","Maziar Salahi","Mohammad M. Ghassemi"],"pdf_url":"https://arxiv.org/pdf/2501.04897v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04896v1","updated":"2025-01-09T00:50:44Z","published":"2025-01-09T00:50:44Z","title":"Quantifying Itch and its Impact on Sleep Using Machine Learning and\n Radio Signals","summary":" Chronic itch affects 13% of the US population, is highly debilitating, and\nunderlies many medical conditions. A major challenge in clinical care and new\ntherapeutics development is the lack of an objective measure for quantifying\nitch, leading to reliance on subjective measures like patients' self-assessment\nof itch severity. In this paper, we show that a home radio device paired with\nartificial intelligence (AI) can concurrently capture scratching and evaluate\nits impact on sleep quality by analyzing radio signals bouncing in the\nenvironment. The device eliminates the need for wearable sensors or skin\ncontact, enabling monitoring of chronic itch over extended periods at home\nwithout burdening patients or interfering with their skin condition. To\nvalidate the technology, we conducted an observational clinical study of\nchronic pruritus patients, monitored at home for one month using both the radio\ndevice and an infrared camera. Comparing the output of the device to ground\ntruth data from the camera demonstrates its feasibility and accuracy (ROC AUC =\n0.997, sensitivity = 0.825, specificity = 0.997). The results reveal a\nsignificant correlation between scratching and low sleep quality, manifested as\na reduction in sleep efficiency (R = 0.6, p < 0.001) and an increase in sleep\nlatency (R = 0.68, p < 0.001). 
Our study underscores the potential of passive,\nlong-term, at-home monitoring of chronic scratching and its sleep implications,\noffering a valuable tool for both clinical care of chronic itch patients and\npharmaceutical clinical trials.\n","authors":["Michail Ouroutzoglou","Mingmin Zhao","Joshua Hellerstein","Hariharan Rahul","Asima Badic","Brian S. Kim","Dina Katabi"],"pdf_url":"https://arxiv.org/pdf/2501.04896v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04894v1","updated":"2025-01-09T00:35:48Z","published":"2025-01-09T00:35:48Z","title":"A Look into How Machine Learning is Reshaping Engineering Models: the\n Rise of Analysis Paralysis, Optimal yet Infeasible Solutions, and the\n Inevitable Rashomon Paradox","summary":" The widespread acceptance of empirically derived codal provisions and\nequations in civil engineering stands in stark contrast to the skepticism\nfacing machine learning (ML) models, despite their shared statistical\nfoundations. This paper examines this philosophical tension through the lens of\nstructural engineering and explores how integrating ML challenges traditional\nengineering philosophies and professional identities. Recent efforts have\ndocumented how ML enhances predictive accuracy, optimizes designs, and analyzes\ncomplex behaviors. However, one might also raise concerns about the diminishing\nrole of human intuition and the interpretability of algorithms. To showcase\nthis rarely explored front, this paper presents how ML can be successfully\nintegrated into various engineering problems by means of formulation via\ndeduction, induction, and abduction. 
Then, this paper identifies three\nprincipal paradoxes that could arise when adopting ML: analysis paralysis\n(increased prediction accuracy leading to a reduced understanding of physical\nmechanisms), infeasible solutions (optimization resulting in unconventional\ndesigns that challenge engineering intuition), and the Rashomon effect (where\ncontradictions in explainability methods and physics arise). This paper\nconcludes by addressing these paradoxes and arguing the need to rethink\nepistemological shifts in engineering and engineering education and\nmethodologies to harmonize traditional principles with ML.\n","authors":["MZ Naser"],"pdf_url":"https://arxiv.org/pdf/2501.04894v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.14390v3","updated":"2025-01-09T00:17:04Z","published":"2024-11-21T18:24:06Z","title":"Persistent Homology for Structural Characterization in Disordered\n Systems","summary":" We propose a unified framework based on persistent homology (PH) to\ncharacterize both local and global structures in disordered systems. It can\nsimultaneously generate local and global descriptors using the same algorithm\nand data structure, and has been shown to be highly effective and interpretable\nin predicting particle rearrangements and classifying global phases. We also\ndemonstrated that using a single variable enables a linear SVM to achieve\nnearly perfect three-phase classification. Inspired by this discovery, we\ndefine a non-parametric metric, the Separation Index (SI), which not only\nachieves this classification without sacrificing significant performance but\nalso establishes a connection between particle environments and the global\nphase structure. 
Our methods provide an effective framework for understanding\nand analyzing the properties of disordered materials, with broad potential\napplications in materials science and even wider studies of complex systems.\n","authors":["An Wang","Li Zou"],"pdf_url":"https://arxiv.org/pdf/2411.14390v3.pdf","comment":"19 pages, 17 figures"},{"id":"http://arxiv.org/abs/2410.05315v2","updated":"2025-01-09T00:11:59Z","published":"2024-10-05T03:37:07Z","title":"PalmBench: A Comprehensive Benchmark of Compressed Large Language Models\n on Mobile Platforms","summary":" Deploying large language models (LLMs) locally on mobile devices is\nadvantageous in scenarios where transmitting data to remote cloud servers is\neither undesirable due to privacy concerns or impractical due to network\nconnection. Recent advancements (MLC, 2023a; Gerganov, 2023) have facilitated\nthe local deployment of LLMs. However, local deployment also presents\nchallenges, particularly in balancing quality (generative performance),\nlatency, and throughput within the hardware constraints of mobile devices. In\nthis paper, we introduce our lightweight, all-in-one automated benchmarking\nframework that allows users to evaluate LLMs on mobile devices. We provide a\ncomprehensive benchmark of various popular LLMs with different quantization\nconfigurations (both weights and activations) across multiple mobile platforms\nwith varying hardware capabilities. Unlike traditional benchmarks that assess\nfull-scale models on high-end GPU clusters, we focus on evaluating resource\nefficiency (memory and power consumption) and harmful output for compressed\nmodels on mobile devices. 
Our key observations include i) differences in energy\nefficiency and throughput across mobile platforms; ii) the impact of\nquantization on memory usage, GPU execution time, and power consumption;\niii) accuracy and performance degradation of quantized models compared to their\nnon-quantized counterparts; and iv) the frequency of hallucinations and toxic\ncontent generated by compressed LLMs on mobile devices.\n","authors":["Yilong Li","Jingyu Liu","Hao Zhang","M Badri Narayanan","Utkarsh Sharma","Shuai Zhang","Pan Hu","Yijing Zeng","Jayaram Raghuram","Suman Banerjee"],"pdf_url":"https://arxiv.org/pdf/2410.05315v2.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2501.05610v1","updated":"2025-01-09T23:18:38Z","published":"2025-01-09T23:18:38Z","title":"Towards Probabilistic Inference of Human Motor Intentions by Assistive\n Mobile Robots Controlled via a Brain-Computer Interface","summary":" Assistive mobile robots are a transformative technology that helps persons\nwith disabilities regain the ability to move freely. Although autonomous\nwheelchairs significantly reduce user effort, they still require human input to\nallow users to maintain control and adapt to changing environments. Brain\nComputer Interface (BCI) stands out as a highly user-friendly option that does\nnot require physical movement. Current BCI systems can understand whether users\nwant to accelerate or decelerate, but they implement these changes in discrete\nspeed steps rather than allowing for smooth, continuous velocity adjustments.\nThis limitation prevents the systems from mimicking the natural, fluid speed\nchanges seen in human self-paced motion. 
The authors aim to address this\nlimitation by redesigning the perception-action cycle in a BCI controlled\nrobotic system: improving how the robotic agent interprets the user's motion\nintentions (world state) and implementing these actions in a way that better\nreflects natural physical properties of motion, such as inertia and damping.\nThe scope of this paper focuses on the perception aspect. We asked and answered\na normative question \"what computation should the robotic agent carry out to\noptimally perceive incomplete or noisy sensory observations?\" Empirical EEG\ndata were collected, and probabilistic representations that served as world\nstate distributions were learned and evaluated in a Generative Adversarial\nNetwork framework. A ROS framework was established that connected to a\nGazebo environment containing a digital twin of an indoor space and a virtual\nmodel of a robotic wheelchair. Signal processing and statistical analyses were\nimplemented to identify the most discriminative features in the\nspatial-spectral-temporal dimensions, which are then used to construct the\nworld model for the robotic agent to interpret user motion intentions as a\nBayesian observer.\n","authors":["Xiaoshan Zhou","Carol M. Menassa","Vineet R. Kamat"],"pdf_url":"https://arxiv.org/pdf/2501.05610v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2501.05605v1","updated":"2025-01-09T22:41:50Z","published":"2025-01-09T22:41:50Z","title":"Advancing Personalized Learning Analysis via an Innovative Domain\n Knowledge Informed Attention-based Knowledge Tracing Method","summary":" Emerging Knowledge Tracing (KT) models, particularly deep learning and\nattention-based Knowledge Tracing, have shown great potential in realizing\npersonalized learning analysis via prediction of students' future performance\nbased on their past interactions. 
The existing methods mainly focus on\nimmediate past interactions or individual concepts without accounting for\ndependencies between knowledge concepts, referred to as knowledge concept\nroutes, which can be critical to advancing the understanding of students'\nlearning outcomes. To address this, in this paper, we propose an innovative\nattention-based method by effectively incorporating the domain knowledge of\nknowledge concept routes in the given curriculum. Additionally, we leverage the\nXES3G5M dataset, a benchmark dataset with rich auxiliary information for\nknowledge concept routes, to evaluate and compare the performance of our\nproposed method to seven state-of-the-art (SOTA) deep learning models.\n","authors":["Shubham Kose","Jin Wei-Kocsis"],"pdf_url":"https://arxiv.org/pdf/2501.05605v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17428v2","updated":"2025-01-09T22:27:06Z","published":"2024-05-27T17:59:45Z","title":"NV-Embed: Improved Techniques for Training LLMs as Generalist Embedding\n Models","summary":" Decoder-only large language model (LLM)-based embedding models are beginning\nto outperform BERT or T5-based embedding models in general-purpose text\nembedding tasks, including dense vector-based retrieval. In this work, we\nintroduce the NV-Embed model, incorporating architectural designs, training\nprocedures, and curated datasets to significantly enhance the performance of\nLLM as a versatile embedding model, while maintaining its simplicity and\nreproducibility. For model architecture, we propose a latent attention layer to\nobtain pooled embeddings, which consistently improves retrieval and downstream\ntask accuracy compared to mean pooling or using the last token embedding\nfrom LLMs. To enhance representation learning, we remove the causal attention\nmask of LLMs during contrastive training. For the training algorithm, we\nintroduce a two-stage contrastive instruction-tuning method. 
It first applies contrastive\ntraining with instructions on retrieval datasets, utilizing in-batch negatives\nand curated hard negative examples. At stage-2, it blends various non-retrieval\ndatasets into instruction tuning, which not only enhances non-retrieval task\naccuracy but also improves retrieval performance. For training data, we utilize\nhard-negative mining, synthetic data generation, and existing publicly available\ndatasets to boost the performance of the embedding model. By combining these\ntechniques, our NV-Embed-v1 and NV-Embed-v2 models obtained the No.1 position\non the Massive Text Embedding Benchmark (MTEB) (as of May 24, 2024 and August\n30, 2024, respectively) across 56 embedding tasks, demonstrating the sustained\neffectiveness of the proposed methods over time. Additionally, it achieved the\nhighest scores in the Long Doc section and the second-highest scores in the QA\nsection of the AIR Benchmark, which covers a range of out-of-domain information\nretrieval topics beyond those in MTEB.\n","authors":["Chankyu Lee","Rajarshi Roy","Mengyao Xu","Jonathan Raiman","Mohammad Shoeybi","Bryan Catanzaro","Wei Ping"],"pdf_url":"https://arxiv.org/pdf/2405.17428v2.pdf","comment":"We open-source the model at:\n https://huggingface.co/nvidia/NV-Embed-v2"},{"id":"http://arxiv.org/abs/2403.13257v3","updated":"2025-01-09T22:21:56Z","published":"2024-03-20T02:38:01Z","title":"Arcee's MergeKit: A Toolkit for Merging Large Language Models","summary":" The rapid expansion of the open-source language model landscape presents an\nopportunity to merge the competencies of these model checkpoints by combining\ntheir parameters. Advances in transfer learning, the process of fine-tuning\npretrained models for specific tasks, have resulted in the development of vast\namounts of task-specific models, typically specialized in individual tasks and\nunable to utilize each other's strengths. 
Model merging facilitates the\ncreation of multitask models without the need for additional training, offering\na promising avenue for enhancing model performance and versatility. By\npreserving the intrinsic capabilities of the original models, model merging\naddresses complex challenges in AI, including the difficulties of catastrophic\nforgetting and multitask learning. To support this expanding area of research,\nwe introduce MergeKit, a comprehensive, open-source library designed to\nfacilitate the application of model merging strategies. MergeKit offers an\nextensible framework to efficiently merge models on any hardware, providing\nutility to researchers and practitioners. To date, thousands of models have\nbeen merged by the open-source community, leading to the creation of some of\nthe world's most powerful open-source model checkpoints, as assessed by the Open\nLLM Leaderboard. The library is accessible at\nhttps://github.com/arcee-ai/MergeKit.\n","authors":["Charles Goddard","Shamane Siriwardhana","Malikeh Ehghaghi","Luke Meyers","Vlad Karpukhin","Brian Benedict","Mark McQuade","Jacob Solawetz"],"pdf_url":"https://arxiv.org/pdf/2403.13257v3.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2411.13553v2","updated":"2025-01-09T22:17:30Z","published":"2024-11-20T18:59:58Z","title":"AI-generated Image Detection: Passive or Watermark?","summary":" While text-to-image models offer numerous benefits, they also pose\nsignificant societal risks. Detecting AI-generated images is crucial for\nmitigating these risks. Detection methods can be broadly categorized into\npassive and watermark-based approaches: passive detectors rely on artifacts\npresent in AI-generated images, whereas watermark-based detectors proactively\nembed watermarks into such images. A key question is which type of detector\nperforms better in terms of effectiveness, robustness, and efficiency. However,\nthe current literature lacks a comprehensive understanding of this issue. 
In\nthis work, we aim to bridge that gap by developing ImageDetectBench, the first\ncomprehensive benchmark to compare the effectiveness, robustness, and\nefficiency of passive and watermark-based detectors. Our benchmark includes\nfour datasets, each containing a mix of AI-generated and non-AI-generated\nimages. We evaluate five passive detectors and four watermark-based detectors\nagainst eight types of common perturbations and three types of adversarial\nperturbations. Our benchmark results reveal several interesting findings. For\ninstance, watermark-based detectors consistently outperform passive detectors,\nboth in the presence and absence of perturbations. Based on these insights, we\nprovide recommendations for detecting AI-generated images, e.g., when both\ntypes of detectors are applicable, watermark-based detectors should be the\npreferred choice. Our code and data are publicly available at\nhttps://github.com/moyangkuo/ImageDetectBench.git.\n","authors":["Moyang Guo","Yuepeng Hu","Zhengyuan Jiang","Zeyu Li","Amir Sadovnik","Arka Daw","Neil Gong"],"pdf_url":"https://arxiv.org/pdf/2411.13553v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06687v2","updated":"2025-01-09T22:14:55Z","published":"2024-08-13T07:27:02Z","title":"Masked Image Modeling: A Survey","summary":" In this work, we survey recent studies on masked image modeling (MIM), an\napproach that emerged as a powerful self-supervised learning technique in\ncomputer vision. The MIM task involves masking some information, e.g., pixels,\npatches, or even latent representations, and training a model, usually an\nautoencoder, to predict the missing information by using the context\navailable in the visible part of the input. We identify and formalize two\ncategories of approaches on how to implement MIM as a pretext task, one based\non reconstruction and one based on contrastive learning. Then, we construct a\ntaxonomy and review the most prominent papers in recent years. 
We complement\nthe manually constructed taxonomy with a dendrogram obtained by applying a\nhierarchical clustering algorithm. We further identify relevant clusters by\nmanually inspecting the resulting dendrogram. Our review also includes datasets\nthat are commonly used in MIM research. We aggregate the performance results of\nvarious masked image modeling methods on the most popular datasets, to\nfacilitate the comparison of competing methods. Finally, we identify research\ngaps and propose several interesting directions for future work. We supplement\nour survey with the following public repository containing organized\nreferences: https://github.com/vladhondru25/MIM-Survey.\n","authors":["Vlad Hondru","Florinel Alin Croitoru","Shervin Minaee","Radu Tudor Ionescu","Nicu Sebe"],"pdf_url":"https://arxiv.org/pdf/2408.06687v2.pdf","comment":"Revised version"},{"id":"http://arxiv.org/abs/2310.03696v3","updated":"2025-01-09T22:12:45Z","published":"2023-10-05T17:13:16Z","title":"Function-Space Optimality of Neural Architectures with Multivariate\n Nonlinearities","summary":" We investigate the function-space optimality (specifically, the Banach-space\noptimality) of a large class of shallow neural architectures with multivariate\nnonlinearities/activation functions. To that end, we construct a new family of\nBanach spaces defined via a regularization operator, the $k$-plane transform,\nand a sparsity-promoting norm. We prove a representer theorem that states that\nthe solution sets to learning problems posed over these Banach spaces are\ncompletely characterized by neural architectures with multivariate\nnonlinearities. 
These optimal architectures have skip connections and are\ntightly connected to orthogonal weight normalization and multi-index models,\nboth of which have received recent interest in the neural network community.\nOur framework is compatible with a number of classical nonlinearities including\nthe rectified linear unit (ReLU) activation function, the norm activation\nfunction, and the radial basis functions found in the theory of\nthin-plate/polyharmonic splines. We also show that the underlying spaces are\nspecial instances of reproducing kernel Banach spaces and variation spaces. Our\nresults shed light on the regularity of functions learned by neural networks\ntrained on data, particularly with multivariate nonlinearities, and provide new\ntheoretical motivation for several architectural choices found in practice.\n","authors":["Rahul Parhi","Michael Unser"],"pdf_url":"https://arxiv.org/pdf/2310.03696v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18731v3","updated":"2025-01-09T22:10:14Z","published":"2024-04-29T14:17:52Z","title":"Real Time Multi Organ Classification on Computed Tomography Images","summary":" Organ segmentation is a fundamental task in medical imaging since it is\nuseful for many clinical automation pipelines. However, some tasks do not\nrequire full segmentation. Instead, a classifier can identify the selected\norgan without segmenting the entire volume. In this study, we demonstrate a\nclassifier-based method to obtain organ labels in real time by using a large\ncontext size with a sparse data sampling strategy. Although our method operates\nas an independent classifier at query locations, it can generate full\nsegmentations by querying grid locations at any resolution, offering faster\nperformance than segmentation algorithms. 
We compared our method with existing\nsegmentation techniques, demonstrating its superior runtime potential for\npractical applications in medical imaging.\n","authors":["Halid Ziya Yerebakan","Yoshihisa Shinagawa","Gerardo Hermosillo Valadez"],"pdf_url":"https://arxiv.org/pdf/2404.18731v3.pdf","comment":"11 pages, Organ Classification, Organ Segmentation"},{"id":"http://arxiv.org/abs/2312.12641v5","updated":"2025-01-09T22:08:44Z","published":"2023-12-19T22:36:37Z","title":"Robust Point Matching with Distance Profiles","summary":" We show the outlier robustness and noise stability of practical matching\nprocedures based on distance profiles. Although the idea of matching points\nbased on invariants like distance profiles has a long history in the\nliterature, there has been little understanding of the theoretical properties\nof such procedures, especially in the presence of outliers and noise. We\nprovide a theoretical analysis showing that under certain probabilistic\nsettings, the proposed matching procedure is successful with high probability\neven in the presence of outliers and noise. We demonstrate the performance of\nthe proposed method using a real data example and provide simulation studies to\ncomplement the theoretical findings. Lastly, we extend the concept of distance\nprofiles to the abstract setting and connect the proposed matching procedure to\nthe Gromov-Wasserstein distance and its lower bound, with a new sample\ncomplexity result derived based on the properties of distance profiles. 
This\npaper contributes to the literature by providing theoretical underpinnings of\nthe matching procedures based on invariants like distance profiles, which have\nbeen widely used in practice but have rarely been analyzed theoretically.\n","authors":["YoonHaeng Hur","Yuehaw Khoo"],"pdf_url":"https://arxiv.org/pdf/2312.12641v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05591v1","updated":"2025-01-09T21:53:03Z","published":"2025-01-09T21:53:03Z","title":"Session-Level Dynamic Ad Load Optimization using Offline Robust\n Reinforcement Learning","summary":" Session-level dynamic ad load optimization aims to personalize the density\nand types of delivered advertisements in real time during a user's online\nsession by dynamically balancing user experience quality and ad monetization.\nTraditional causal learning-based approaches struggle with key technical\nchallenges, especially in handling confounding bias and distribution shifts. In\nthis paper, we develop an offline deep Q-network (DQN)-based framework that\neffectively mitigates confounding bias in dynamic systems and demonstrates more\nthan 80% offline gains compared to the best causal learning-based production\nbaseline. Moreover, to improve the framework's robustness against unanticipated\ndistribution shifts, we further enhance our framework with a novel offline\nrobust dueling DQN approach. This approach achieves more stable rewards on\nmultiple OpenAI-Gym datasets as perturbations increase, and provides an\nadditional 5% offline gains on real-world ad delivery data.\n Deployed across multiple production systems, our approach has achieved\noutsized topline gains. 
Post-launch online A/B tests have shown double-digit\nimprovements in the engagement-ad score trade-off efficiency, significantly\nenhancing our platform's capability to serve both consumers and advertisers.\n","authors":["Tao Liu","Qi Xu","Wei Shi","Zhigang Hua","Shuang Yang"],"pdf_url":"https://arxiv.org/pdf/2501.05591v1.pdf","comment":"Will appear in KDD 2025"},{"id":"http://arxiv.org/abs/2501.05588v1","updated":"2025-01-09T21:45:09Z","published":"2025-01-09T21:45:09Z","title":"Enforcing Fundamental Relations via Adversarial Attacks on Input\n Parameter Correlations","summary":" Correlations between input parameters play a crucial role in many scientific\nclassification tasks, since these are often related to fundamental laws of\nnature. For example, in high energy physics, one of the common deep learning\nuse-cases is the classification of signal and background processes in particle\ncollisions. In many such cases, the fundamental principles of the correlations\nbetween observables are often better understood than the actual distributions\nof the observables themselves. In this work, we present a new adversarial\nattack algorithm called Random Distribution Shuffle Attack (RDSA), emphasizing\nthe correlations between observables in the network rather than individual\nfeature characteristics. Correct application of the proposed novel attack can\nresult in a significant improvement in classification performance,\nparticularly in the context of data augmentation, when using the generated\nadversaries within adversarial training. Correlations between input\nfeatures are also crucial in many other disciplines. 
We demonstrate the\neffectiveness of RDSA on six classification tasks, including two particle collision\nchallenges (using CERN Open Data), hand-written digit recognition (MNIST784),\nhuman activity recognition (HAR), weather forecasting (Rain in Australia), and\nICU patient mortality (MIMIC-IV), demonstrating a general use case beyond\nfundamental physics for this new type of adversarial attack algorithm.\n","authors":["Timo Saala","Lucie Flek","Alexander Jung","Akbar Karimi","Alexander Schmidt","Matthias Schott","Philipp Soldin","Christopher Wiebusch"],"pdf_url":"https://arxiv.org/pdf/2501.05588v1.pdf","comment":"12 pages, 8 figures (Without appendix)"},{"id":"http://arxiv.org/abs/2501.05583v1","updated":"2025-01-09T21:21:06Z","published":"2025-01-09T21:21:06Z","title":"Learned Discrepancy Reconstruction and Benchmark Dataset for Magnetic\n Particle Imaging","summary":" Magnetic Particle Imaging (MPI) is an emerging imaging modality based on the\nmagnetic response of superparamagnetic iron oxide nanoparticles to achieve\nhigh-resolution and real-time imaging without harmful radiation. One key\nchallenge in the MPI image reconstruction task arises from its underlying noise\nmodel, which does not fulfill the implicit Gaussian assumptions that are made\nwhen applying traditional reconstruction approaches. To address this challenge,\nwe introduce the Learned Discrepancy Approach, a novel learning-based\nreconstruction method for inverse problems that includes a learned discrepancy\nfunction. It enhances traditional techniques by incorporating an invertible\nneural network to explicitly model problem-specific noise distributions. This\napproach does not rely on implicit Gaussian noise assumptions, making it\nespecially suited to handle the sophisticated noise model in MPI and also\napplicable to other inverse problems. 
To further advance MPI reconstruction\ntechniques, we introduce the MPI-MNIST dataset, a large collection of\nsimulated MPI measurements derived from the MNIST dataset of handwritten\ndigits. The dataset includes noise-perturbed measurements generated from\nstate-of-the-art model-based system matrices and measurements of a preclinical\nMPI scanner device. This provides a realistic and flexible environment for\nalgorithm testing. Validated against the MPI-MNIST dataset, our method\ndemonstrates significant improvements in reconstruction quality in terms of\nstructural similarity when compared to classical reconstruction techniques.\n","authors":["Meira Iske","Hannes Albers","Tobias Knopp","Tobias Kluth"],"pdf_url":"https://arxiv.org/pdf/2501.05583v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05580v1","updated":"2025-01-09T21:14:25Z","published":"2025-01-09T21:14:25Z","title":"Physics-Driven Learning for Inverse Problems in Quantum Chromodynamics","summary":" The integration of deep learning techniques and physics-driven designs is\nreforming the way we address inverse problems, in which accurate physical\nproperties are extracted from complex data sets. This is particularly relevant\nfor quantum chromodynamics (QCD), the theory of strong interactions, with its\ninherent limitations in observational data and demanding computational\napproaches. This perspective highlights advances and potential of\nphysics-driven learning methods, focusing on predictions of physical quantities\ntowards QCD physics, and drawing connections to machine learning (ML). It is\nshown that the fusion of ML and physics can lead to more efficient and reliable\nproblem-solving strategies. Key ideas of ML, methodology of embedding physics\npriors, and generative models as inverse modelling of physical probability\ndistributions are introduced. Specific applications cover first-principles\nlattice calculations, and QCD physics of hadrons, neutron stars, and heavy-ion\ncollisions. 
These examples provide a structured and concise overview of how\nincorporating prior knowledge such as symmetry, continuity and equations into\ndeep learning designs can address diverse inverse problems across different\nphysical sciences.\n","authors":["Gert Aarts","Kenji Fukushima","Tetsuo Hatsuda","Andreas Ipp","Shuzhe Shi","Lingxiao Wang","Kai Zhou"],"pdf_url":"https://arxiv.org/pdf/2501.05580v1.pdf","comment":"14 pages, 5 figures, submitted version to Nat Rev Phys"},{"id":"http://arxiv.org/abs/2501.05564v1","updated":"2025-01-09T20:19:27Z","published":"2025-01-09T20:19:27Z","title":"Analog Bayesian neural networks are insensitive to the shape of the\n weight distribution","summary":" Recent work has demonstrated that Bayesian neural networks (BNNs) trained\nwith mean field variational inference (MFVI) can be implemented in analog\nhardware, promising orders of magnitude energy savings compared to the standard\ndigital implementations. However, while Gaussians are typically used as the\nvariational distribution in MFVI, it is difficult to precisely control the\nshape of the noise distributions produced by sampling analog devices. This\npaper introduces a method for MFVI training using real device noise as the\nvariational distribution. Furthermore, we demonstrate empirically that the\npredictive distributions from BNNs with the same weight means and variances\nconverge to the same distribution, regardless of the shape of the variational\ndistribution. This result suggests that analog device designers do not need to\nconsider the shape of the device noise distribution when hardware-implementing\nBNNs performing MFVI.\n","authors":["Ravi G. 
Patrick Xiao","Sapan Agarwal","Christopher Bennett"],"pdf_url":"https://arxiv.org/pdf/2501.05564v1.pdf","comment":"Presented at the NeurIPS 2024 Workshop on Machine Learning with New\n Compute Paradigms, https://openreview.net/forum?id=soS5qgU7Yb"},{"id":"http://arxiv.org/abs/2501.05563v1","updated":"2025-01-09T20:19:01Z","published":"2025-01-09T20:19:01Z","title":"Prediction-Assisted Online Distributed Deep Learning Workload Scheduling\n in GPU Clusters","summary":" The recent explosive growth of deep learning (DL) models has created a\ncompelling need for efficient job scheduling for distributed deep learning\ntraining with mixed parallelisms (DDLwMP) in GPU clusters. This paper proposes\nan adaptive shortest-remaining-processing-time-first (A-SRPT) scheduling\nalgorithm, a novel prediction-assisted online scheduling approach designed to\nmitigate the challenges associated with DL cluster scheduling. By modeling each\njob as a graph corresponding to heterogeneous Deep Neural Network (DNN) models\nand their associated distributed training configurations, A-SRPT strategically\nassigns jobs to the available GPUs, thereby minimizing inter-server\ncommunication overhead. Observing that most DDLwMP jobs recur, A-SRPT\nincorporates a random forest regression model to predict training iterations.\nCrucially, A-SRPT maps the complex scheduling problem into a single-machine\ninstance, which is addressed optimally by a preemptive\n\"shortest-remaining-processing-time-first\" strategy. This optimized solution\nserves as a guide for actual job scheduling within the GPU clusters, leading to\na theoretically provable competitive scheduling efficiency. We conduct\nextensive real-world testbed and simulation experiments to verify our proposed\nalgorithms.\n","authors":["Ziyue Luo","Jia Liu","Myungjin Lee","Ness B. 
Shroff"],"pdf_url":"https://arxiv.org/pdf/2501.05563v1.pdf","comment":"INFOCOM 2025"},{"id":"http://arxiv.org/abs/2501.05559v1","updated":"2025-01-09T20:11:08Z","published":"2025-01-09T20:11:08Z","title":"Soup to go: mitigating forgetting during continual learning with model\n averaging","summary":" In continual learning, where task data arrives in a sequence, fine-tuning on\nlater tasks will often lead to performance degradation on earlier tasks. This\nis especially pronounced when these tasks come from diverse domains. In this\nsetting, how can we mitigate catastrophic forgetting of earlier tasks and\nretain what the model has learned with minimal computational expenses? Inspired\nby other merging methods and L2-regression, we propose Sequential Fine-tuning\nwith Averaging (SFA), a method that merges currently training models with\nearlier checkpoints during the course of training. SOTA approaches typically\nmaintain a data buffer of past tasks or impose a penalty at each gradient step.\nIn contrast, our method achieves comparable results without the need to store\npast data or multiple copies of parameters for each gradient step.\nFurthermore, our method outperforms common merging techniques such as Task\nArithmetic, TIES Merging, and WiSE-FT, as well as other penalty methods like L2\nand Elastic Weight Consolidation. 
In turn, our method offers insight into the\nbenefits of merging partially-trained models during training across both image\nand language domains.\n","authors":["Anat Kleiman","Gintare Karolina Dziugaite","Jonathan Frankle","Sham Kakade","Mansheej Paul"],"pdf_url":"https://arxiv.org/pdf/2501.05559v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13969v2","updated":"2025-01-09T20:08:31Z","published":"2023-08-26T22:48:06Z","title":"Gaze-Informed Vision Transformers: Predicting Driving Decisions Under\n Uncertainty","summary":" Vision Transformers (ViT) have advanced computer vision, yet their efficacy\nin complex tasks like driving remains less explored. This study enhances ViT by\nintegrating human eye gaze, captured via eye-tracking, to increase prediction\naccuracy in driving scenarios under uncertainty in both real-world and virtual\nreality scenarios. First, we establish the significance of human eye gaze in\nleft-right driving decisions, as observed in both human subjects and a ViT\nmodel. By comparing the similarity between human fixation maps and ViT\nattention weights, we reveal the dynamics of overlap across individual heads\nand layers. This overlap demonstrates that fixation data can guide the model in\ndistributing its attention weights more effectively. We introduce the\nfixation-attention intersection (FAX) loss, a novel loss function that\nsignificantly improves ViT performance under high uncertainty conditions. Our\nresults show that ViT, when trained with FAX loss, aligns its attention with\nhuman gaze patterns. 
This gaze-informed approach has significant potential for\ndriver behavior analysis, as well as broader applications in human-centered AI\nsystems, extending ViT's use to complex visual environments.\n","authors":["Sharath Koorathota","Nikolas Papadopoulos","Jia Li Ma","Shruti Kumar","Xiaoxiao Sun","Arunesh Mittal","Patrick Adelman","Paul Sajda"],"pdf_url":"https://arxiv.org/pdf/2308.13969v2.pdf","comment":"25 pages, 9 figures, 3 tables"},{"id":"http://arxiv.org/abs/2501.00190v2","updated":"2025-01-09T20:00:16Z","published":"2024-12-31T00:02:07Z","title":"SepsisCalc: Integrating Clinical Calculators into Early Sepsis\n Prediction via Dynamic Temporal Graph Construction","summary":" Sepsis is an organ dysfunction caused by a dysregulated immune response to an\ninfection. Early sepsis prediction and identification allow for timely\nintervention, leading to improved clinical outcomes. Clinical calculators\n(e.g., the six-organ dysfunction assessment of SOFA) play a vital role in\nsepsis identification within clinicians' workflow, providing evidence-based\nrisk assessments essential for sepsis diagnosis. However, artificial\nintelligence (AI) sepsis prediction models typically generate a single sepsis\nrisk score without incorporating clinical calculators for assessing organ\ndysfunctions, making the models less convincing and transparent to clinicians.\nTo bridge the gap, we propose to mimic clinicians' workflow with a novel\nframework SepsisCalc to integrate clinical calculators into the predictive\nmodel, yielding a clinically transparent and precise model for utilization in\nclinical settings. Practically, clinical calculators usually combine\ninformation from multiple component variables in Electronic Health Records\n(EHR), and might not be applicable when the variables are (partially) missing.\nWe mitigate this issue by representing EHRs as temporal graphs and integrating\na learning module to dynamically add the accurately estimated calculator to the\ngraphs. 
Experimental results on real-world datasets show that the proposed\nmodel outperforms state-of-the-art methods on sepsis prediction tasks.\nMoreover, we developed a system to identify organ dysfunctions and potential\nsepsis risks, providing a human-AI interaction tool for deployment, which can\nhelp clinicians understand the prediction outputs and prepare timely\ninterventions for the corresponding dysfunctions, paving the way for actionable\nclinical decision-making support for early intervention.\n","authors":["Changchang Yin","Shihan Fu","Bingsheng Yao","Thai-Hoang Pham","Weidan Cao","Dakuo Wang","Jeffrey Caterino","Ping Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.00190v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05550v1","updated":"2025-01-09T19:48:51Z","published":"2025-01-09T19:48:51Z","title":"Emergent weight morphologies in deep neural networks","summary":" Whether deep neural networks can exhibit emergent behaviour is not only\nrelevant for understanding how deep learning works; it is also pivotal for\nestimating potential security risks of increasingly capable artificial\nintelligence systems. Here, we show that training deep neural networks gives\nrise to emergent weight morphologies independent of the training data.\nSpecifically, in analogy to condensed matter physics, we derive a theory that\npredicts that the homogeneous state of deep neural networks is unstable in a way\nthat leads to the emergence of periodic channel structures. We verified these\nstructures by performing numerical experiments on a variety of data sets. 
Our\nwork demonstrates emergence in the training of deep neural networks, which\nimpacts the achievable performance of deep neural networks.\n","authors":["Pascal de Jong","Felix Meigel","Steffen Rulands"],"pdf_url":"https://arxiv.org/pdf/2501.05550v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.01919v5","updated":"2025-01-09T19:42:52Z","published":"2024-03-04T10:36:06Z","title":"Randomized Approach to Matrix Completion: Applications in Collaborative\n Filtering and Image Inpainting","summary":" We present a novel method for matrix completion, specifically designed for\nmatrices where one dimension significantly exceeds the other. Our Columns\nSelected Matrix Completion (CSMC) method combines Column Subset Selection and\nLow-Rank Matrix Completion to efficiently reconstruct incomplete datasets. In\neach step, CSMC solves a convex optimization problem. We introduce two\nalgorithms to implement CSMC, each tailored to problems of different sizes. A\nformal analysis is provided, outlining the necessary assumptions and the\nprobability of obtaining a correct solution. To assess the impact of matrix\nsize, rank, and the ratio of missing entries on solution quality and\ncomputation time, we conducted experiments on synthetic data. The method was\nalso applied to two real-world problems: recommendation systems and image\ninpainting. 
Our results show that CSMC provides solutions of the same quality\nas state-of-the-art matrix completion algorithms based on convex optimization,\nwhile achieving significant reductions in computational runtime.\n","authors":["Antonina Krajewska","Ewa Niewiadomska-Szynkiewicz"],"pdf_url":"https://arxiv.org/pdf/2403.01919v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16865v3","updated":"2025-01-09T19:39:12Z","published":"2024-05-27T06:31:39Z","title":"An Investigation of Conformal Isometry Hypothesis for Grid Cells","summary":" This paper investigates the conformal isometry hypothesis as a potential\nexplanation for hexagonal periodic patterns in grid cell response maps. The\nhypothesis posits that grid cell activity forms a high-dimensional vector in\nneural space, encoding the agent's position in 2D physical space. As the agent\nmoves, this vector rotates within a 2D manifold in the neural space, driven by\na recurrent neural network. The conformal hypothesis suggests that this neural\nmanifold is a conformally isometric embedding of physical space, where local\ndisplacements in neural space are proportional to those in physical space. In\nthis paper, we conduct numerical experiments to show that this hypothesis leads\nto the hexagon periodic patterns of grid cells, agnostic to the choice of\ntransformation models. Furthermore, we present a theoretical understanding that\nhexagon patterns emerge by minimizing our loss function because hexagon flat\ntorus exhibits minimal deviation from local conformal isometry. 
In addition, we\npropose a conformal modulation of the agent's input velocity, enabling the\nrecurrent neural network of grid cells to satisfy the conformal isometry\nhypothesis automatically.\n","authors":["Dehong Xu","Ruiqi Gao","Wen-Hao Zhang","Xue-Xin Wei","Ying Nian Wu"],"pdf_url":"https://arxiv.org/pdf/2405.16865v3.pdf","comment":"arXiv admin note: text overlap with arXiv:2310.19192"},{"id":"http://arxiv.org/abs/2501.05541v1","updated":"2025-01-09T19:27:28Z","published":"2025-01-09T19:27:28Z","title":"NSChat: A Chatbot System To Rule Them All","summary":" The rapid advancement of artificial intelligence has resulted in the advent\nof large language models (LLMs) with the capacity to produce text that closely\nresembles human communication. These models have been seamlessly integrated\ninto diverse applications, enabling interactive and responsive communication\nacross multiple platforms. The potential utility of chatbots transcends these\ntraditional applications, particularly in research contexts, wherein they can\noffer valuable insights and facilitate the design of innovative experiments. In\nthis study, we present NSChat, a web-based chatbot system designed to assist in\nneuroscience research. The system is meticulously designed to function as an\nexperimental instrument rather than a conventional chatbot, requiring users\nto input a username and experiment code upon access. This setup facilitates\nprecise data cross-referencing, thereby augmenting the integrity and\napplicability of the data collected for research purposes. It can be easily\nexpanded to accommodate new basic events as needed, and it allows researchers\nto integrate their own logging events without the necessity of implementing a\nseparate logging mechanism. 
It is worth noting that our system was built primarily to\nassist neuroscience research, but it is not limited to it; it can easily\nbe adapted to assist information retrieval research or interaction with chatbot\nagents in general.\n","authors":["Zenon Lamprou","Yashar Moshfeghi"],"pdf_url":"https://arxiv.org/pdf/2501.05541v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05534v1","updated":"2025-01-09T19:16:41Z","published":"2025-01-09T19:16:41Z","title":"OmniJet-${α_{ C}}$: Learning point cloud calorimeter simulations\n using generative transformers","summary":" We show the first use of generative transformers for generating calorimeter\nshowers as point clouds in a high-granularity calorimeter. Using the tokenizer\nand generative part of the OmniJet-${\alpha}$ model, we represent the hits in\nthe detector as sequences of integers. This model allows variable-length\nsequences, which means that it supports realistic shower development and does\nnot need to be conditioned on the number of hits. Since the tokenization\nrepresents the showers as point clouds, the model learns the geometry of the\nshowers without being restricted to any particular voxel grid.\n","authors":["Joschka Birk","Frank Gaede","Anna Hallin","Gregor Kasieczka","Martina Mozzanica","Henning Rose"],"pdf_url":"https://arxiv.org/pdf/2501.05534v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.08755v2","updated":"2025-01-09T19:15:20Z","published":"2024-12-11T19:54:14Z","title":"Proactive Adversarial Defense: Harnessing Prompt Tuning in\n Vision-Language Models to Detect Unseen Backdoored Images","summary":" Backdoor attacks pose a critical threat by embedding hidden triggers into\ninputs, causing models to misclassify them into target labels. While extensive\nresearch has focused on mitigating these attacks in object recognition models\nthrough weight fine-tuning, much less attention has been given to detecting\nbackdoored samples directly. 
Given the vast datasets used in training, manual\ninspection for backdoor triggers is impractical, and even state-of-the-art\ndefense mechanisms fail to fully neutralize their impact. To address this gap,\nwe introduce a groundbreaking method to detect unseen backdoored images during\nboth training and inference. Leveraging the transformative success of prompt\ntuning in Vision Language Models (VLMs), our approach trains learnable text\nprompts to differentiate clean images from those with hidden backdoor triggers.\nExperiments demonstrate the exceptional efficacy of this method, achieving an\nimpressive average accuracy of 86% across two renowned datasets for detecting\nunseen backdoor triggers, establishing a new standard in backdoor defense.\n","authors":["Kyle Stein","Andrew Arash Mahyari","Guillermo Francia","Eman El-Sheikh"],"pdf_url":"https://arxiv.org/pdf/2412.08755v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05239v3","updated":"2025-01-09T19:12:14Z","published":"2023-08-09T21:54:34Z","title":"Enhancing Architecture Frameworks by Including Modern Stakeholders and\n their Views/Viewpoints","summary":" Various architecture frameworks for software, systems, and enterprises have\nbeen proposed in the literature. They identified several stakeholders and\ndefined modeling perspectives, architecture viewpoints, and views to frame and\naddress stakeholder concerns. However, the stakeholders with data science and\nMachine Learning (ML) related concerns, such as data scientists and data\nengineers, are yet to be included in existing architecture frameworks. Only\nthis way can we envision a holistic system architecture description of an\nML-enabled system. Note that the ML component behavior and functionalities are\nspecial and should be distinguished from traditional software system behavior\nand functionalities. The main reason is that the actual functionality should be\ninferred from data instead of being specified at design time. 
Additionally, the\nstructural models of ML components, such as ML model architectures, are\ntypically specified using different notations and formalisms from what the\nSoftware Engineering (SE) community uses for software structural models. Yet,\nthese two aspects, namely ML and non-ML, are becoming so intertwined that they\nnecessitate an extension of software architecture frameworks and modeling\npractices toward supporting ML-enabled system architectures. In this paper, we\naddress this gap through an empirical study using an online survey instrument.\nWe surveyed 61 subject matter experts from over 25 organizations in 10\ncountries.\n","authors":["Armin Moin","Atta Badii","Stephan Günnemann","Moharram Challenger"],"pdf_url":"https://arxiv.org/pdf/2308.05239v3.pdf","comment":"ICICT 2025"},{"id":"http://arxiv.org/abs/2501.05530v1","updated":"2025-01-09T19:08:23Z","published":"2025-01-09T19:08:23Z","title":"Outlyingness Scores with Cluster Catch Digraphs","summary":" This paper introduces two novel outlyingness scores (OSs) based on Cluster\nCatch Digraphs (CCDs): Outbound Outlyingness Score (OOS) and Inbound\nOutlyingness Score (IOS). These scores enhance the interpretability of outlier\ndetection results. Both OSs employ graph-, density-, and distribution-based\ntechniques, tailored to high-dimensional data with varying cluster shapes and\nintensities. OOS evaluates the outlyingness of a point relative to its nearest\nneighbors, while IOS assesses the total ``influence\" a point receives from\nothers within its cluster. Both OSs effectively identify global and local\noutliers, invariant to data collinearity. Moreover, IOS is robust to\nmasking problems. With extensive Monte Carlo simulations, we compare the\nperformance of both OSs with CCD-based, traditional, and state-of-the-art\noutlier detection methods. 
Both OSs exhibit substantial overall improvements\nover the CCD-based methods in both artificial and real-world data sets,\nparticularly with IOS, which delivers the best overall performance among all\nthe methods, especially in high-dimensional settings.\n Keywords: Outlier detection, Outlyingness score, Graph-based clustering,\nCluster catch digraphs, High-dimensional data.\n","authors":["Rui Shi","Nedret Billor","Elvan Ceyhan"],"pdf_url":"https://arxiv.org/pdf/2501.05530v1.pdf","comment":"29 pages, 7 figures, 16 tables"},{"id":"http://arxiv.org/abs/2501.05515v1","updated":"2025-01-09T19:00:03Z","published":"2025-01-09T19:00:03Z","title":"Neural Architecture Codesign for Fast Physics Applications","summary":" We develop a pipeline to streamline neural architecture codesign for physics\napplications to reduce the need for ML expertise when designing models for\nnovel tasks. Our method employs neural architecture search and network\ncompression in a two-stage approach to discover hardware efficient models. This\napproach consists of a global search stage that explores a wide range of\narchitectures while considering hardware constraints, followed by a local\nsearch stage that fine-tunes and compresses the most promising candidates. We\nexceed performance on various tasks and show further speedup through model\ncompression techniques such as quantization-aware-training and neural network\npruning. We synthesize the optimal models to high level synthesis code for FPGA\ndeployment with the hls4ml library. Additionally, our hierarchical search space\nprovides greater flexibility in optimization, which can easily extend to other\ntasks and domains. 
We demonstrate this with two case studies: Bragg peak\nfinding in materials science and jet classification in high energy physics,\nachieving models with improved accuracy, smaller latencies, or reduced resource\nutilization relative to the baseline models.\n","authors":["Jason Weitz","Dmitri Demler","Luke McDermott","Nhan Tran","Javier Duarte"],"pdf_url":"https://arxiv.org/pdf/2501.05515v1.pdf","comment":"21 pages, 6 figures"},{"id":"http://arxiv.org/abs/2501.05503v1","updated":"2025-01-09T18:50:47Z","published":"2025-01-09T18:50:47Z","title":"The more polypersonal the better -- a short look on space geometry of\n fine-tuned layers","summary":" The interpretation of deep learning models is a rapidly growing field, with\nparticular interest in language models. There are various approaches to this\ntask, including training simpler models to replicate neural network predictions\nand analyzing the latent space of the model. The latter method allows us to not\nonly identify patterns in the model's decision-making process, but also\nunderstand the features of its internal structure. In this paper, we analyze\nthe changes in the internal representation of the BERT model when it is trained\nwith additional grammatical modules and data containing new grammatical\nstructures (polypersonality). 
We find that adding a single grammatical layer\ncauses the model to separate the new and old grammatical systems within itself,\nimproving the overall performance on perplexity metrics.\n","authors":["Sergei Kudriashov","Veronika Zykova","Angelina Stepanova","Yakov Raskind","Eduard Klyshinsky"],"pdf_url":"https://arxiv.org/pdf/2501.05503v1.pdf","comment":"Neuroinformatics 2024"},{"id":"http://arxiv.org/abs/2501.05502v1","updated":"2025-01-09T18:44:10Z","published":"2025-01-09T18:44:10Z","title":"Shrink the longest: improving latent space isotropy with symplicial\n geometry","summary":" Although transformer-based models have been dominating the field of deep\nlearning, various studies of their embedding space have shown that they suffer\nfrom the \"representation degeneration problem\": embeddings tend to be distributed\nin a narrow cone, making the latent space highly anisotropic. Increasing the\nisotropy has been shown to improve performance in downstream tasks both in static\nand contextual language models. However, most approaches either add\ninference overhead or require a substantial amount of data for model\nreparametrization. We propose a novel regularization technique based on\nsimplicial geometry to improve the isotropy of latent representations. The core\nidea of our method is based on maximizing the persistent entropy of barcodes\nobtained using Vietoris-Rips filtration from contextual embeddings in the\nunderlying latent space. 
We demonstrate that the method leads to an increase in\ndownstream performance while significantly lowering the anisotropy during\nfine-tuning by exploiting existing geometric structures instead of\nreparametrization.\n","authors":["Sergei Kudriashov","Olesya Karpik","Eduard Klyshinsky"],"pdf_url":"https://arxiv.org/pdf/2501.05502v1.pdf","comment":"AIST-2024"},{"id":"http://arxiv.org/abs/2501.05501v1","updated":"2025-01-09T18:43:05Z","published":"2025-01-09T18:43:05Z","title":"Strategy Masking: A Method for Guardrails in Value-based Reinforcement\n Learning Agents","summary":" The use of reward functions to structure AI learning and decision making is\ncore to the current reinforcement learning paradigm; however, without careful\ndesign of reward functions, agents can learn to solve problems in ways that may\nbe considered ``undesirable\" or ``unethical.\" Without a thorough understanding of\nthe incentives a reward function creates, it can be difficult to impose\nprincipled yet general control mechanisms over its behavior. In this paper, we\nstudy methods for constructing guardrails for AI agents that use reward\nfunctions to learn decision making. We introduce a novel approach, which we\ncall strategy masking, to explicitly learn and then suppress undesirable AI\nagent behavior. 
We apply our method to study lying in AI agents and show that\nstrategy masking can effectively modify agent behavior by suppressing, or\nactively penalizing, the reward dimension for lying such that agents act more\nhonestly while not compromising their ability to perform effectively.\n","authors":["Jonathan Keane","Sam Keyser","Jeremy Kedziora"],"pdf_url":"https://arxiv.org/pdf/2501.05501v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05499v1","updated":"2025-01-09T18:02:12Z","published":"2025-01-09T18:02:12Z","title":"Generalization of Urban Wind Environment Using Fourier Neural Operator\n Across Different Wind Directions and Cities","summary":" Simulation of urban wind environments is crucial for urban planning,\npollution control, and renewable energy utilization. However, the computational\nrequirements of high-fidelity computational fluid dynamics (CFD) methods make\nthem impractical for real cities. To address these limitations, this study\ninvestigates the effectiveness of the Fourier Neural Operator (FNO) model in\npredicting flow fields under different wind directions and urban layouts. By\ntraining the model on velocity data from large eddy simulations, we evaluate\nthe performance of the model under different urban configurations and wind\nconditions. The results show that the FNO model can\nprovide accurate predictions while significantly reducing the computational\ntime by 99%. Our innovative approach of dividing the wind field into smaller\nspatial blocks for training improves the ability of the FNO model to capture\nwind frequency features effectively. The signed distance function (SDF) data also provides important\nspatial building information, enhancing the model's ability to recognize\nphysical boundaries and generate more realistic predictions. 
The proposed FNO\napproach enhances the AI model's generalizability for different wind directions\nand urban layouts.\n","authors":["Cheng Chen","Geng Tian","Shaoxiang Qin","Senwen Yang","Dingyang Geng","Dongxue Zhan","Jinqiu Yang","David Vidal","Liangzhu Leon Wang"],"pdf_url":"https://arxiv.org/pdf/2501.05499v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05498v1","updated":"2025-01-09T17:47:17Z","published":"2025-01-09T17:47:17Z","title":"Generative Flow Networks: Theory and Applications to Structure Learning","summary":" Without any assumptions about data generation, multiple causal models may\nexplain our observations equally well. To avoid selecting a single arbitrary\nmodel that could result in unsafe decisions if it does not match reality, it is\ntherefore essential to maintain a notion of epistemic uncertainty about our\npossible candidates. This thesis studies the problem of structure learning from\na Bayesian perspective, approximating the posterior distribution over the\nstructure of a causal model, represented as a directed acyclic graph (DAG),\ngiven data. It introduces Generative Flow Networks (GFlowNets), a novel class\nof probabilistic models designed for modeling distributions over discrete and\ncompositional objects such as graphs. They treat generation as a sequential\ndecision making problem, constructing samples of a target distribution defined\nup to a normalization constant piece by piece. In the first part of this\nthesis, we present the mathematical foundations of GFlowNets, their connections\nto existing domains of machine learning and statistics such as variational\ninference and reinforcement learning, and their extensions beyond discrete\nproblems. 
In the second part of this thesis, we show how GFlowNets can\napproximate the posterior distribution over DAG structures of causal Bayesian\nNetworks, along with the parameters of its causal mechanisms, given\nobservational and experimental data.\n","authors":["Tristan Deleu"],"pdf_url":"https://arxiv.org/pdf/2501.05498v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05496v1","updated":"2025-01-09T16:10:03Z","published":"2025-01-09T16:10:03Z","title":"FedSA: A Unified Representation Learning via Semantic Anchors for\n Prototype-based Federated Learning","summary":" Prototype-based federated learning has emerged as a promising approach that\nshares lightweight prototypes to transfer knowledge among clients with data\nheterogeneity in a model-agnostic manner. However, existing methods often\ncollect prototypes directly from local models, which inevitably introduce\ninconsistencies into representation learning due to the biased data\ndistributions and differing model architectures among clients. In this paper,\nwe identify that both statistical and model heterogeneity create a vicious\ncycle of representation inconsistency, classifier divergence, and skewed\nprototype alignment, which negatively impacts the performance of clients. To\nbreak the vicious cycle, we propose a novel framework named Federated Learning\nvia Semantic Anchors (FedSA) to decouple the generation of prototypes from\nlocal representation learning. We introduce a novel perspective that uses\nsimple yet effective semantic anchors serving as prototypes to guide local\nmodels in learning consistent representations. By incorporating semantic\nanchors, we further propose anchor-based regularization with margin-enhanced\ncontrastive learning and anchor-based classifier calibration to correct feature\nextractors and calibrate classifiers across clients, achieving intra-class\ncompactness and inter-class separability of prototypes while ensuring\nconsistent decision boundaries. 
We then update the semantic anchors with these\nconsistent and discriminative prototypes, which iteratively encourage clients\nto collaboratively learn a unified data representation with robust\ngeneralization. Extensive experiments under both statistical and model\nheterogeneity settings show that FedSA significantly outperforms existing\nprototype-based FL methods on various classification tasks.\n","authors":["Yanbing Zhou","Xiangmou Qu","Chenlong You","Jiyang Zhou","Jingyue Tang","Xin Zheng","Chunmao Cai","Yingbo Wu"],"pdf_url":"https://arxiv.org/pdf/2501.05496v1.pdf","comment":"Accepted by AAAI2025"},{"id":"http://arxiv.org/abs/2501.05495v1","updated":"2025-01-09T15:47:30Z","published":"2025-01-09T15:47:30Z","title":"LSEBMCL: A Latent Space Energy-Based Model for Continual Learning","summary":" Continual learning has become essential in many practical applications such\nas online news summaries and product classification. The primary challenge is\nknown as catastrophic forgetting, a phenomenon where a model inadvertently\ndiscards previously learned knowledge when it is trained on new tasks. Existing\nsolutions involve storing exemplars from previous classes, regularizing\nparameters during the fine-tuning process, or assigning different model\nparameters to each task. The proposed solution LSEBMCL (Latent Space\nEnergy-Based Model for Continual Learning) in this work is to use energy-based\nmodels (EBMs) to prevent catastrophic forgetting by sampling data points from\nprevious tasks when training on new ones. The EBM is a machine learning model\nthat associates an energy value with each input data point. The proposed method\nuses an EBM layer as an outer-generator in the continual learning framework for\nNLP tasks. 
The study demonstrates the efficacy of EBM in NLP tasks, achieving\nstate-of-the-art results in all experiments.\n","authors":["Xiaodi Li","Dingcheng Li","Rujun Gao","Mahmoud Zamani","Latifur Khan"],"pdf_url":"https://arxiv.org/pdf/2501.05495v1.pdf","comment":"In the 7th International Conference on Artificial Intelligence in\n Information and Communication (ICAIIC 2025)"},{"id":"http://arxiv.org/abs/2501.05494v1","updated":"2025-01-09T14:32:08Z","published":"2025-01-09T14:32:08Z","title":"Mathematical Modeling and Machine Learning for Predicting Shade-Seeking\n Behavior in Cows Under Heat Stress","summary":" In this paper we develop a mathematical model combined with machine learning\ntechniques to predict shade-seeking behavior in cows exposed to heat stress.\nThe approach integrates advanced mathematical features, such as time-averaged\nthermal indices and accumulated heat stress metrics, obtained by mathematical\nanalysis of data from a farm in Titaguas (Valencia, Spain), collected during\nthe summer of 2023. Two predictive models, Random Forests and Neural Networks,\nare compared for accuracy, robustness, and interpretability. The Random Forest\nmodel is highlighted for its balance between precision and explainability,\nachieving an RMSE of $14.97$. The methodology also employs $5-$fold\ncross-validation to ensure robustness under real-world conditions. This work\nnot only advances the mathematical modeling of animal behavior but also\nprovides useful insights for mitigating heat stress in livestock through\ndata-driven tools.\n","authors":["S. Sanjuan","D. A. Méndez","R. Arnau","J. M. Calabuig","X. Díaz de Otálora Aguirre","F. 
Estellés"],"pdf_url":"https://arxiv.org/pdf/2501.05494v1.pdf","comment":"22 pages, 10 figures"},{"id":"http://arxiv.org/abs/2501.05238v1","updated":"2025-01-09T13:44:15Z","published":"2025-01-09T13:44:15Z","title":"FOCUS: Towards Universal Foreground Segmentation","summary":" Foreground segmentation is a fundamental task in computer vision,\nencompassing various subdivision tasks. Previous research has typically\ndesigned task-specific architectures for each task, leading to a lack of\nunification. Moreover, they primarily focus on recognizing foreground objects\nwithout effectively distinguishing them from the background. In this paper, we\nemphasize the importance of the background and its relationship with the\nforeground. We introduce FOCUS, the Foreground ObjeCts Universal Segmentation\nframework that can handle multiple foreground tasks. We develop a multi-scale\nsemantic network using the edge information of objects to enhance image\nfeatures. To achieve boundary-aware segmentation, we propose a novel\ndistillation method, integrating the contrastive learning strategy to refine\nthe prediction mask in multi-modal feature space. We conduct extensive\nexperiments on a total of 13 datasets across 5 tasks, and the results\ndemonstrate that FOCUS consistently outperforms the state-of-the-art\ntask-specific models on most metrics.\n","authors":["Zuyao You","Lingyu Kong","Lingchen Meng","Zuxuan Wu"],"pdf_url":"https://arxiv.org/pdf/2501.05238v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05493v1","updated":"2025-01-09T12:26:11Z","published":"2025-01-09T12:26:11Z","title":"Monotonic Learning in the PAC Framework: A New Perspective","summary":" Monotone learning refers to learning processes in which expected performance\nconsistently improves as more training data is introduced. 
Non-monotone\nbehavior of machine learning has been the topic of a series of recent works,\nwith various proposals that ensure monotonicity by applying transformations or\nwrappers on learning algorithms. In this work, from a different perspective, we\ntackle the topic of monotone learning within the framework of Probably\nApproximately Correct (PAC) learning theory. Following the mechanism that\nestimates sample complexity of a PAC-learnable problem, we derive a performance\nlower bound for that problem, and prove the monotonicity of that bound as the\nsample sizes increase. By calculating the lower bound distribution, we are able\nto prove that given a PAC-learnable problem with a hypothesis space that is\neither of finite size or of finite VC dimension, any learning algorithm based\non Empirical Risk Minimization (ERM) is monotone if training samples are\nindependent and identically distributed (i.i.d.). We further carry out an\nexperiment on two concrete machine learning problems, one of which has a finite\nhypothesis set, and the other of finite VC dimension, and compared the\nexperimental data for the empirical risk distributions with the estimated\ntheoretical bound. The results of the comparison have confirmed the\nmonotonicity of learning for the two PAC-learnable problems.\n","authors":["Ming Li","Chenyi Zhang","Qin Li"],"pdf_url":"https://arxiv.org/pdf/2501.05493v1.pdf","comment":"16 pages"}],"Multimedia":[{"id":"http://arxiv.org/abs/2411.07899v3","updated":"2025-01-09T13:01:55Z","published":"2024-11-12T16:12:51Z","title":"Rendering-Oriented 3D Point Cloud Attribute Compression using Sparse\n Tensor-based Transformer","summary":" The evolution of 3D visualization techniques has fundamentally transformed\nhow we interact with digital content. At the forefront of this change is point\ncloud technology, offering an immersive experience that surpasses traditional\n2D representations. 
However, the massive data size of point clouds presents\nsignificant challenges in data compression. Current methods for lossy point\ncloud attribute compression (PCAC) generally focus on reconstructing the\noriginal point clouds with minimal error. However, for point cloud\nvisualization scenarios, the reconstructed point clouds with distortion still\nneed to undergo a complex rendering process, which affects the final\nuser-perceived quality. In this paper, we propose an end-to-end deep learning\nframework that seamlessly integrates PCAC with differentiable rendering,\ndenoted as rendering-oriented PCAC (RO-PCAC), directly targeting the quality of\nrendered multiview images for viewing. In a differentiable manner, the impact\nof the rendering process on the reconstructed point clouds is taken into\naccount. Moreover, we characterize point clouds as sparse tensors and propose a\nsparse tensor-based transformer, called SP-Trans. By aligning with the local\ndensity of the point cloud and utilizing an enhanced local attention mechanism,\nSP-Trans captures the intricate relationships within the point cloud, further\nimproving feature analysis and synthesis within the framework. Extensive\nexperiments demonstrate that the proposed RO-PCAC achieves state-of-the-art\ncompression performance, compared to existing reconstruction-oriented methods,\nincluding traditional, learning-based, and hybrid methods.\n","authors":["Xiao Huo","Junhui Hou","Shuai Wan","Fuzheng Yang"],"pdf_url":"https://arxiv.org/pdf/2411.07899v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03700v2","updated":"2025-01-09T09:12:06Z","published":"2023-12-06T18:59:19Z","title":"OneLLM: One Framework to Align All Modalities with Language","summary":" Multimodal large language models (MLLMs) have gained significant attention\ndue to their strong multimodal understanding capability. 
However, existing\nworks rely heavily on modality-specific encoders, which usually differ in\narchitecture and are limited to common modalities. In this paper, we present\nOneLLM, an MLLM that aligns eight modalities to language using a unified\nframework. We achieve this through a unified multimodal encoder and a\nprogressive multimodal alignment pipeline. In detail, we first train an image\nprojection module to connect a vision encoder with LLM. Then, we build a\nuniversal projection module (UPM) by mixing multiple image projection modules\nand dynamic routing. Finally, we progressively align more modalities to LLM\nwith the UPM. To fully leverage the potential of OneLLM in following\ninstructions, we also curated a comprehensive multimodal instruction dataset,\nincluding 2M items from image, audio, video, point cloud, depth/normal map, IMU\nand fMRI brain activity. OneLLM is evaluated on 25 diverse benchmarks,\nencompassing tasks such as multimodal captioning, question answering and\nreasoning, where it delivers excellent performance. Code, data, model and\nonline demo are available at https://github.com/csuhan/OneLLM\n","authors":["Jiaming Han","Kaixiong Gong","Yiyuan Zhang","Jiaqi Wang","Kaipeng Zhang","Dahua Lin","Yu Qiao","Peng Gao","Xiangyu Yue"],"pdf_url":"https://arxiv.org/pdf/2312.03700v2.pdf","comment":"Accepted by CVPR 2024. Code: https://github.com/csuhan/OneLLM"}],"Artificial Intelligence":[{"id":"http://arxiv.org/abs/2501.05453v1","updated":"2025-01-09T18:59:58Z","published":"2025-01-09T18:59:58Z","title":"An Empirical Study of Autoregressive Pre-training from Videos","summary":" We empirically study autoregressive pre-training from videos. To perform our\nstudy, we construct a series of autoregressive video models, called Toto. We\ntreat videos as sequences of visual tokens and train transformer models to\nautoregressively predict future tokens. 
Our models are pre-trained on a diverse\ndataset of videos and images comprising over 1 trillion visual tokens. We\nexplore different architectural, training, and inference design choices. We\nevaluate the learned visual representations on a range of downstream tasks\nincluding image recognition, video classification, object tracking, and\nrobotics. Our results demonstrate that, despite minimal inductive biases,\nautoregressive pre-training leads to competitive performance across all\nbenchmarks. Finally, we find that scaling our video models results in similar\nscaling curves to those seen in language models, albeit with a different rate.\nMore details at https://brjathu.github.io/toto/\n","authors":["Jathushan Rajasegaran","Ilija Radosavovic","Rahul Ravishankar","Yossi Gandelsman","Christoph Feichtenhofer","Jitendra Malik"],"pdf_url":"https://arxiv.org/pdf/2501.05453v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05445v1","updated":"2025-01-09T18:56:05Z","published":"2025-01-09T18:56:05Z","title":"Consistent Flow Distillation for Text-to-3D Generation","summary":" Score Distillation Sampling (SDS) has made significant strides in distilling\nimage-generative models for 3D generation. However, its\nmaximum-likelihood-seeking behavior often leads to degraded visual quality and\ndiversity, limiting its effectiveness in 3D applications. In this work, we\npropose Consistent Flow Distillation (CFD), which addresses these limitations.\nWe begin by leveraging the gradient of the diffusion ODE or SDE sampling\nprocess to guide the 3D generation. From the gradient-based sampling\nperspective, we find that the consistency of 2D image flows across different\nviewpoints is important for high-quality 3D generation. To achieve this, we\nintroduce multi-view consistent Gaussian noise on the 3D object, which can be\nrendered from various viewpoints to compute the flow gradient. 
Our experiments\ndemonstrate that CFD, through consistent flows, significantly outperforms\nprevious methods in text-to-3D generation.\n","authors":["Runjie Yan","Yinbo Chen","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2501.05445v1.pdf","comment":"Project page: https://runjie-yan.github.io/cfd/"},{"id":"http://arxiv.org/abs/2501.05443v1","updated":"2025-01-09T18:55:50Z","published":"2025-01-09T18:55:50Z","title":"A survey of textual cyber abuse detection using cutting-edge language\n models and large language models","summary":" The success of social media platforms has facilitated the emergence of\nvarious forms of online abuse within digital communities. This abuse manifests\nin multiple ways, including hate speech, cyberbullying, emotional abuse,\ngrooming, and sexting. In this paper, we present a comprehensive analysis of\nthe different forms of abuse prevalent in social media, with a particular focus\non how emerging technologies, such as Language Models (LMs) and Large Language\nModels (LLMs), are reshaping both the detection and generation of abusive\ncontent within these networks. We delve into the mechanisms through which\nsocial media abuse is perpetuated, exploring the psychological and social\nimpact. Additionally, we examine the dual role of advanced language\nmodels-highlighting their potential to enhance automated detection systems for\nabusive behavior while also acknowledging their capacity to generate harmful\ncontent. This paper aims to contribute to the ongoing discourse on online\nsafety and ethics, offering insights into the evolving landscape of cyberabuse\nand the technological innovations that both mitigate and exacerbate it.\n","authors":["Jose A. 
Diaz-Garcia","Joao Paulo Carvalho"],"pdf_url":"https://arxiv.org/pdf/2501.05443v1.pdf","comment":"37 pages, under review in WIREs Data Mining and Knowledge Discovery"},{"id":"http://arxiv.org/abs/2501.05442v1","updated":"2025-01-09T18:55:15Z","published":"2025-01-09T18:55:15Z","title":"Progressive Growing of Video Tokenizers for Highly Compressed Latent\n Spaces","summary":" Video tokenizers are essential for latent video diffusion models, converting\nraw video data into spatiotemporally compressed latent spaces for efficient\ntraining. However, extending state-of-the-art video tokenizers to achieve a\ntemporal compression ratio beyond 4x without increasing channel capacity poses\nsignificant challenges. In this work, we propose an alternative approach to\nenhance temporal compression. We find that the reconstruction quality of\ntemporally subsampled videos from a low-compression encoder surpasses that of\nhigh-compression encoders applied to original videos. This indicates that\nhigh-compression models can leverage representations from lower-compression\nmodels. Building on this insight, we develop a bootstrapped\nhigh-temporal-compression model that progressively trains high-compression\nblocks atop well-trained lower-compression models. Our method includes a\ncross-level feature-mixing module to retain information from the pretrained\nlow-compression model and guide higher-compression blocks to capture the\nremaining details from the full video sequence. Evaluation of video benchmarks\nshows that our method significantly improves reconstruction quality while\nincreasing temporal compression compared to direct extensions of existing video\ntokenizers. 
Furthermore, the resulting compact latent space effectively trains\na video diffusion model for high-quality video generation with a reduced token\nbudget.\n","authors":["Aniruddha Mahapatra","Long Mai","Yitian Zhang","David Bourgin","Feng Liu"],"pdf_url":"https://arxiv.org/pdf/2501.05442v1.pdf","comment":"Project website:\n https://progressive-video-tokenizer.github.io/Pro-MAG/"},{"id":"http://arxiv.org/abs/2501.05439v1","updated":"2025-01-09T18:49:39Z","published":"2025-01-09T18:49:39Z","title":"From Simple to Complex Skills: The Case of In-Hand Object Reorientation","summary":" Learning policies in simulation and transferring them to the real world has\nbecome a promising approach in dexterous manipulation. However, bridging the\nsim-to-real gap for each new task requires substantial human effort, such as\ncareful reward engineering, hyperparameter tuning, and system identification.\nIn this work, we present a system that leverages low-level skills to address\nthese challenges for more complex tasks. Specifically, we introduce a\nhierarchical policy for in-hand object reorientation based on previously\nacquired rotation skills. This hierarchical policy learns to select which\nlow-level skill to execute based on feedback from both the environment and the\nlow-level skill policies themselves. Compared to learning from scratch, the\nhierarchical policy is more robust to out-of-distribution changes and transfers\neasily from simulation to real-world environments. Additionally, we propose a\ngeneralizable object pose estimator that uses proprioceptive information,\nlow-level skill predictions, and control errors as inputs to estimate the\nobject pose over time. 
We demonstrate that our system can reorient objects,\nincluding symmetrical and textureless ones, to a desired pose.\n","authors":["Haozhi Qi","Brent Yi","Mike Lambeta","Yi Ma","Roberto Calandra","Jitendra Malik"],"pdf_url":"https://arxiv.org/pdf/2501.05439v1.pdf","comment":"website: https://dexhier.github.io"},{"id":"http://arxiv.org/abs/2501.05435v1","updated":"2025-01-09T18:48:35Z","published":"2025-01-09T18:48:35Z","title":"Neuro-Symbolic AI in 2024: A Systematic Review","summary":" Background: The field of Artificial Intelligence has undergone cyclical\nperiods of growth and decline, known as AI summers and winters. Currently, we\nare in the third AI summer, characterized by significant advancements and\ncommercialization, particularly in the integration of Symbolic AI and\nSub-Symbolic AI, leading to the emergence of Neuro-Symbolic AI.\n Methods: The review followed the PRISMA methodology, utilizing databases such\nas IEEE Explore, Google Scholar, arXiv, ACM, and SpringerLink. The inclusion\ncriteria targeted peer-reviewed papers published between 2020 and 2024. Papers\nwere screened for relevance to Neuro-Symbolic AI, with further inclusion based\non the availability of associated codebases to ensure reproducibility.\n Results: From an initial pool of 1,428 papers, 167 met the inclusion criteria\nand were analyzed in detail. The majority of research efforts are concentrated\nin the areas of learning and inference (63%), logic and reasoning (35%), and\nknowledge representation (44%). Explainability and trustworthiness are less\nrepresented (28%), with Meta-Cognition being the least explored area (5%). The\nreview identifies significant interdisciplinary opportunities, particularly in\nintegrating explainability and trustworthiness with other research areas.\n Conclusion: Neuro-Symbolic AI research has seen rapid growth since 2020, with\nconcentrated efforts in learning and inference. 
Significant gaps remain in\nexplainability, trustworthiness, and Meta-Cognition. Addressing these gaps\nthrough interdisciplinary research will be crucial for advancing the field\ntowards more intelligent, reliable, and context-aware AI systems.\n","authors":["Brandon C. Colelough","William Regli"],"pdf_url":"https://arxiv.org/pdf/2501.05435v1.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2410.08405v2","updated":"2025-01-09T18:43:18Z","published":"2024-10-10T22:38:26Z","title":"AgroGPT: Efficient Agricultural Vision-Language Model with Expert Tuning","summary":" Significant progress has been made in advancing large multimodal\nconversational models (LMMs), capitalizing on vast repositories of image-text\ndata available online. Despite this progress, these models often encounter\nsubstantial domain gaps, hindering their ability to engage in complex\nconversations across new domains. Recent efforts have aimed to mitigate this\nissue, albeit relying on domain-specific image-text data to curate\ninstruction-tuning data. However, many domains, such as agriculture, lack such\nvision-language data. In this work, we propose an approach to construct\ninstruction-tuning data that harnesses vision-only data for the agriculture\ndomain. We utilize diverse agricultural datasets spanning multiple domains,\ncurate class-specific information, and employ large language models (LLMs) to\nconstruct an expert-tuning set, resulting in a 70k expert-tuning dataset called\nAgroInstruct. Subsequently, we expert-tuned and created AgroGPT, an efficient\nLMM that can hold complex agriculture-related conversations and provide useful\ninsights. We also develop AgroEvals for evaluation and compare {AgroGPT's}\nperformance with large open and closed-source models. 
{AgroGPT} excels at\nidentifying fine-grained agricultural concepts, can act as an agriculture\nexpert, and provides helpful information for multimodal agriculture questions.\nThe code, datasets, and models are available at\nhttps://github.com/awaisrauf/agroGPT.\n","authors":["Muhammad Awais","Ali Husain Salem Abdulla Alharthi","Amandeep Kumar","Hisham Cholakkal","Rao Muhammad Anwer"],"pdf_url":"https://arxiv.org/pdf/2410.08405v2.pdf","comment":"Accepted at WACV, 2025"},{"id":"http://arxiv.org/abs/2501.05409v1","updated":"2025-01-09T18:06:45Z","published":"2025-01-09T18:06:45Z","title":"A Novel Pathology Foundation Model by Mayo Clinic, Charité, and\n Aignostics","summary":" Recent advances in digital pathology have demonstrated the effectiveness of\nfoundation models across diverse applications. In this report, we present a\nnovel vision foundation model based on the RudolfV approach. Our model was\ntrained on a dataset comprising 1.2 million histopathology whole slide images,\ncollected from two medical institutions: Mayo Clinic and Charit\\'e -\nUniverst\\\"atsmedizin Berlin. 
Comprehensive evaluations show that our model\nachieves state-of-the-art performance across twenty-one public benchmark\ndatasets, even though it is neither the largest model by parameter count nor by\ntraining dataset size.\n","authors":["Maximilian Alber","Stephan Tietz","Jonas Dippel","Timo Milbich","Timothée Lesort","Panos Korfiatis","Moritz Krügener","Beatriz Perez Cancer","Neelay Shah","Alexander Möllers","Philipp Seegerer","Alexandra Carpen-Amarie","Kai Standvoss","Gabriel Dernbach","Edwin de Jong","Simon Schallenberg","Andreas Kunft","Helmut Hoffer von Ankershoffen","Gavin Schaeferle","Patrick Duffy","Matt Redlon","Philipp Jurmeister","David Horst","Lukas Ruff","Klaus-Robert Müller","Frederick Klauschen","Andrew Norgan"],"pdf_url":"https://arxiv.org/pdf/2501.05409v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05408v1","updated":"2025-01-09T18:05:33Z","published":"2025-01-09T18:05:33Z","title":"TimeRL: Efficient Deep Reinforcement Learning with Polyhedral Dependence\n Graphs","summary":" Modern deep learning (DL) workloads increasingly use complex deep\nreinforcement learning (DRL) algorithms that generate training data within the\nlearning loop. This results in programs with several nested loops and dynamic\ndata dependencies between tensors. While DL systems with eager execution\nsupport such dynamism, they lack the optimizations and smart scheduling of\ngraph-based execution. Graph-based execution, however, cannot express dynamic\ntensor shapes, instead requiring the use of multiple static subgraphs. Either\nexecution model for DRL thus leads to redundant computation, reduced\nparallelism, and less efficient memory management.\n We describe TimeRL, a system for executing dynamic DRL programs that combines\nthe dynamism of eager execution with the whole-program optimizations and\nscheduling of graph-based execution. 
TimeRL achieves this by introducing the\ndeclarative programming model of recurrent tensors, which allows users to\ndefine dynamic dependencies as intuitive recurrence equations. TimeRL\ntranslates recurrent tensors into a polyhedral dependence graph (PDG) with\ndynamic dependencies as symbolic expressions. Through simple PDG\ntransformations, TimeRL applies whole-program optimizations, such as automatic\nvectorization, incrementalization, and operator fusion. The PDG also allows for\nthe computation of an efficient program-wide execution schedule, which decides\non buffer deallocations, buffer donations, and GPU/CPU memory swapping. We show\nthat TimeRL executes current DRL algorithms up to 47$\\times$ faster than\nexisting DRL systems, while using 16$\\times$ less GPU peak memory.\n","authors":["Pedro F. Silvestre","Peter Pietzuch"],"pdf_url":"https://arxiv.org/pdf/2501.05408v1.pdf","comment":"17 pages, 11 figures, 5 bibliography pages"},{"id":"http://arxiv.org/abs/2501.05407v1","updated":"2025-01-09T18:05:05Z","published":"2025-01-09T18:05:05Z","title":"On-line Policy Improvement using Monte-Carlo Search","summary":" We present a Monte-Carlo simulation algorithm for real-time policy\nimprovement of an adaptive controller. In the Monte-Carlo simulation, the\nlong-term expected reward of each possible action is statistically measured,\nusing the initial policy to make decisions in each step of the simulation. The\naction maximizing the measured expected reward is then taken, resulting in an\nimproved policy. Our algorithm is easily parallelizable and has been\nimplemented on the IBM SP1 and SP2 parallel-RISC supercomputers.\n We have obtained promising initial results in applying this algorithm to the\ndomain of backgammon. Results are reported for a wide variety of initial\npolicies, ranging from a random policy to TD-Gammon, an extremely strong\nmulti-layer neural network. 
In each case, the Monte-Carlo algorithm gives a\nsubstantial reduction, by as much as a factor of 5 or more, in the error rate\nof the base players. The algorithm is also potentially useful in many other\nadaptive control applications in which it is possible to simulate the\nenvironment.\n","authors":["Gerald Tesauro","Gregory R. Galperin"],"pdf_url":"https://arxiv.org/pdf/2501.05407v1.pdf","comment":"Accompanied by oral presentation by Gregory Galperin at NeurIPS 1996\n (then known as NIPS*96)"},{"id":"http://arxiv.org/abs/2405.13536v2","updated":"2025-01-09T17:58:44Z","published":"2024-05-22T11:14:00Z","title":"Attention Mechanisms Don't Learn Additive Models: Rethinking Feature\n Importance for Transformers","summary":" We address the critical challenge of applying feature attribution methods to\nthe transformer architecture, which dominates current applications in natural\nlanguage processing and beyond. Traditional attribution methods to explainable\nAI (XAI) explicitly or implicitly rely on linear or additive surrogate models\nto quantify the impact of input features on a model's output. In this work, we\nformally prove an alarming incompatibility: transformers are structurally\nincapable of representing linear or additive surrogate models used for feature\nattribution, undermining the grounding of these conventional explanation\nmethodologies. To address this discrepancy, we introduce the Softmax-Linked\nAdditive Log Odds Model (SLALOM), a novel surrogate model specifically designed\nto align with the transformer framework. SLALOM demonstrates the capacity to\ndeliver a range of insightful explanations with both synthetic and real-world\ndatasets. We highlight SLALOM's unique efficiency-quality curve by showing that\nSLALOM can produce explanations with substantially higher fidelity than\ncompeting surrogate models or provide explanations of comparable quality at a\nfraction of their computational costs. 
We release code for SLALOM as an\nopen-source project online at https://github.com/tleemann/slalom_explanations.\n","authors":["Tobias Leemann","Alina Fastowski","Felix Pfeiffer","Gjergji Kasneci"],"pdf_url":"https://arxiv.org/pdf/2405.13536v2.pdf","comment":"TMLR Camera-Ready version"},{"id":"http://arxiv.org/abs/2501.05403v1","updated":"2025-01-09T17:57:56Z","published":"2025-01-09T17:57:56Z","title":"TimeDP: Learning to Generate Multi-Domain Time Series with Domain\n Prompts","summary":" Time series generation models are crucial for applications like data\naugmentation and privacy preservation. Most existing time series generation\nmodels are typically designed to generate data from one specified domain. While\nleveraging data from other domain for better generalization is proved to work\nin other application areas, this approach remains challenging for time series\nmodeling due to the large divergence in patterns among different real world\ntime series categories. In this paper, we propose a multi-domain time series\ndiffusion model with domain prompts, named TimeDP. In TimeDP, we utilize a time\nseries semantic prototype module which defines time series prototypes to\nrepresent time series basis, each prototype vector serving as \"word\"\nrepresenting some elementary time series feature. A prototype assignment module\nis applied to extract the extract domain specific prototype weights, for\nlearning domain prompts as generation condition. During sampling, we extract\n\"domain prompt\" with few-shot samples from the target domain and use the domain\nprompts as condition to generate time series samples. 
Experiments demonstrate\nthat our method outperforms baselines to provide the state-of-the-art in-domain\ngeneration quality and strong unseen domain generation capability.\n","authors":["Yu-Hao Huang","Chang Xu","Yueying Wu","Wu-Jun Li","Jiang Bian"],"pdf_url":"https://arxiv.org/pdf/2501.05403v1.pdf","comment":"AAAI 2025"},{"id":"http://arxiv.org/abs/2501.05401v1","updated":"2025-01-09T17:50:56Z","published":"2025-01-09T17:50:56Z","title":"BRATI: Bidirectional Recurrent Attention for Time-Series Imputation","summary":" Missing data in time-series analysis poses significant challenges, affecting\nthe reliability of downstream applications. Imputation, the process of\nestimating missing values, has emerged as a key solution. This paper introduces\nBRATI, a novel deep-learning model designed to address multivariate time-series\nimputation by combining Bidirectional Recurrent Networks and Attention\nmechanisms. BRATI processes temporal dependencies and feature correlations\nacross long and short time horizons, utilizing two imputation blocks that\noperate in opposite temporal directions. Each block integrates recurrent layers\nand attention mechanisms to effectively resolve long-term dependencies.\n We evaluate BRATI on three real-world datasets under diverse missing-data\nscenarios: randomly missing values, fixed-length missing sequences, and\nvariable-length missing sequences. Our findings demonstrate that BRATI\nconsistently outperforms state-of-the-art models, delivering superior accuracy\nand robustness in imputing multivariate time-series data.\n","authors":["Armando Collado-Villaverde","Pablo Muñoz","Maria D. 
R-Moreno"],"pdf_url":"https://arxiv.org/pdf/2501.05401v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05398v1","updated":"2025-01-09T17:47:34Z","published":"2025-01-09T17:47:34Z","title":"Mechanistic understanding and validation of large AI models with\n SemanticLens","summary":" Unlike human-engineered systems such as aeroplanes, where each component's\nrole and dependencies are well understood, the inner workings of AI models\nremain largely opaque, hindering verifiability and undermining trust. This\npaper introduces SemanticLens, a universal explanation method for neural\nnetworks that maps hidden knowledge encoded by components (e.g., individual\nneurons) into the semantically structured, multimodal space of a foundation\nmodel such as CLIP. In this space, unique operations become possible, including\n(i) textual search to identify neurons encoding specific concepts, (ii)\nsystematic analysis and comparison of model representations, (iii) automated\nlabelling of neurons and explanation of their functional roles, and (iv) audits\nto validate decision-making against requirements. Fully scalable and operating\nwithout human input, SemanticLens is shown to be effective for debugging and\nvalidation, summarizing model knowledge, aligning reasoning with expectations\n(e.g., adherence to the ABCDE-rule in melanoma classification), and detecting\ncomponents tied to spurious correlations and their associated training data. By\nenabling component-level understanding and validation, the proposed approach\nhelps bridge the \"trust gap\" between AI models and traditional engineered\nsystems. 
We provide code for SemanticLens on\nhttps://github.com/jim-berend/semanticlens and a demo on\nhttps://semanticlens.hhi-research-insights.eu.\n","authors":["Maximilian Dreyer","Jim Berend","Tobias Labarta","Johanna Vielhaben","Thomas Wiegand","Sebastian Lapuschkin","Wojciech Samek"],"pdf_url":"https://arxiv.org/pdf/2501.05398v1.pdf","comment":"74 pages (18 pages manuscript, 7 pages references, 49 pages appendix)"},{"id":"http://arxiv.org/abs/2501.05391v1","updated":"2025-01-09T17:33:08Z","published":"2025-01-09T17:33:08Z","title":"The global consensus on the risk management of autonomous driving","summary":" Every maneuver of a vehicle redistributes risks between road users. While\nhuman drivers do this intuitively, autonomous vehicles allow and require\ndeliberative algorithmic risk management. But how should traffic risks be\ndistributed among road users? In a global experimental study in eight countries\nwith different cultural backgrounds and almost 11,000 participants, we compared\nrisk distribution preferences. It turns out that risk preferences in road\ntraffic are strikingly similar between the cultural zones. The vast majority of\nparticipants in all countries deviates from a guiding principle of minimizing\naccident probabilities in favor of weighing up the probability and severity of\naccidents. At the national level, the consideration of accident probability and\nseverity hardly differs between countries. The social dilemma of autonomous\nvehicles detected in deterministic crash scenarios disappears in risk\nassessments of everyday traffic situations in all countries. In no country do\ncyclists receive a risk bonus that goes beyond their higher vulnerability. 
In\nsum, our results suggest that a global consensus on the risk ethics of\nautonomous driving is easier to establish than on the ethics of crashing.\n","authors":["Sebastian Krügel","Matthias Uhl"],"pdf_url":"https://arxiv.org/pdf/2501.05391v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.19185v2","updated":"2025-01-09T17:29:40Z","published":"2024-10-24T22:34:27Z","title":"Tailored-LLaMA: Optimizing Few-Shot Learning in Pruned LLaMA Models with\n Task-Specific Prompts","summary":" Large language models demonstrate impressive proficiency in language\nunderstanding and generation. Nonetheless, training these models from scratch,\neven the least complex billion-parameter variant demands significant\ncomputational resources rendering it economically impractical for many\norganizations. With large language models functioning as general-purpose task\nsolvers, this paper investigates their task-specific fine-tuning. We employ\ntask-specific datasets and prompts to fine-tune two pruned LLaMA models having\n5 billion and 4 billion parameters. This process utilizes the pre-trained\nweights and focuses on a subset of weights using the LoRA method. One challenge\nin fine-tuning the LLaMA model is crafting a precise prompt tailored to the\nspecific task. To address this, we propose a novel approach to fine-tune the\nLLaMA model under two primary constraints: task specificity and prompt\neffectiveness. Our approach, Tailored LLaMA initially employs structural\npruning to reduce the model sizes from 7B to 5B and 4B parameters.\nSubsequently, it applies a carefully designed prompt specific to the task and\nutilizes the LoRA method to accelerate the fine-tuning process. 
Moreover,\nfine-tuning a model pruned by 50\\% for less than one hour restores the mean\naccuracy of classification tasks to 95.68\\% at a 20\\% compression ratio and to\n86.54\\% at a 50\\% compression ratio through few-shot learning with 50 shots.\nOur validation of Tailored LLaMA on these two pruned variants demonstrates that\neven when compressed to 50\\%, the models maintain over 65\\% of the baseline\nmodel accuracy in few-shot classification and generation tasks. These findings\nhighlight the efficacy of our tailored approach in maintaining high performance\nwith significantly reduced model sizes.\n","authors":["Danyal Aftab","Steven Davy"],"pdf_url":"https://arxiv.org/pdf/2410.19185v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05382v1","updated":"2025-01-09T17:11:22Z","published":"2025-01-09T17:11:22Z","title":"Large Physics Models: Towards a collaborative approach with Large\n Language Models and Foundation Models","summary":" This paper explores ideas and provides a potential roadmap for the\ndevelopment and evaluation of physics-specific large-scale AI models, which we\ncall Large Physics Models (LPMs). These models, based on foundation models such\nas Large Language Models (LLMs) - trained on broad data - are tailored to\naddress the demands of physics research. LPMs can function independently or as\npart of an integrated framework. This framework can incorporate specialized\ntools, including symbolic reasoning modules for mathematical manipulations,\nframeworks to analyse specific experimental and simulated data, and mechanisms\nfor synthesizing theories and scientific literature. We begin by examining\nwhether the physics community should actively develop and refine dedicated\nmodels, rather than relying solely on commercial LLMs. We then outline how LPMs\ncan be realized through interdisciplinary collaboration among experts in\nphysics, computer science, and philosophy of science. 
To integrate these models\neffectively, we identify three key pillars: Development, Evaluation, and\nPhilosophical Reflection. Development focuses on constructing models capable of\nprocessing physics texts, mathematical formulations, and diverse physical data.\nEvaluation assesses accuracy and reliability by testing and benchmarking.\nFinally, Philosophical Reflection encompasses the analysis of broader\nimplications of LLMs in physics, including their potential to generate new\nscientific understanding and what novel collaboration dynamics might arise in\nresearch. Inspired by the organizational structure of experimental\ncollaborations in particle physics, we propose a similarly interdisciplinary\nand collaborative approach to building and refining Large Physics Models. This\nroadmap provides specific objectives, defines pathways to achieve them, and\nidentifies challenges that must be addressed to realise physics-specific large\nscale AI models.\n","authors":["Kristian G. Barman","Sascha Caron","Emily Sullivan","Henk W. de Regt","Roberto Ruiz de Austri","Mieke Boon","Michael Färber","Stefan Fröse","Faegheh Hasibi","Andreas Ipp","Rukshak Kapoor","Gregor Kasieczka","Daniel Kostić","Michael Krämer","Tobias Golling","Luis G. Lopez","Jesus Marco","Sydney Otten","Pawel Pawlowski","Pietro Vischia","Erik Weber","Christoph Weniger"],"pdf_url":"https://arxiv.org/pdf/2501.05382v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05368v1","updated":"2025-01-09T16:49:04Z","published":"2025-01-09T16:49:04Z","title":"Developing a Foundation of Vector Symbolic Architectures Using Category\n Theory","summary":" At the risk of overstating the case, connectionist approaches to machine\nlearning, i.e. neural networks, are enjoying a small vogue right now. However,\nthese methods require large volumes of data and produce models that are\nuninterpretable to humans. 
An alternative framework that is compatible with\nneural networks and gradient-based learning, but explicitly models\ncompositionality, is Vector Symbolic Architectures (VSAs). VSAs are a family of\nalgebras on high-dimensional vector representations. They arose in cognitive\nscience from the need to unify neural processing and the kind of symbolic\nreasoning that humans perform. While machine learning methods have benefited\nfrom category theoretical analyses, VSAs have not yet received similar\ntreatment. In this paper, we present a first attempt at applying category\ntheory to VSAs. Specifically, we conduct a brief literature survey\ndemonstrating the lacking intersection of these two topics, provide a list of\ndesiderata for VSAs, and propose that VSAs may be understood as a (division)\nrig in a category enriched over a monoid in Met (the category of Lawvere metric\nspaces). This final contribution suggests that VSAs may be generalised beyond\ncurrent implementations. It is our hope that grounding VSAs in category theory\nwill lead to more rigorous connections with other research, both within and\nbeyond, learning and cognition.\n","authors":["Nolan P Shaw","P Michael Furlong","Britt Anderson","Jeff Orchard"],"pdf_url":"https://arxiv.org/pdf/2501.05368v1.pdf","comment":"13 pages, no figures, 2 tables, one appendix"},{"id":"http://arxiv.org/abs/2501.05366v1","updated":"2025-01-09T16:48:17Z","published":"2025-01-09T16:48:17Z","title":"Search-o1: Agentic Search-Enhanced Large Reasoning Models","summary":" Large reasoning models (LRMs) like OpenAI-o1 have demonstrated impressive\nlong stepwise reasoning capabilities through large-scale reinforcement\nlearning. However, their extended reasoning processes often suffer from\nknowledge insufficiency, leading to frequent uncertainties and potential\nerrors. 
To address this limitation, we introduce \\textbf{Search-o1}, a\nframework that enhances LRMs with an agentic retrieval-augmented generation\n(RAG) mechanism and a Reason-in-Documents module for refining retrieved\ndocuments. Search-o1 integrates an agentic search workflow into the reasoning\nprocess, enabling dynamic retrieval of external knowledge when LRMs encounter\nuncertain knowledge points. Additionally, due to the verbose nature of\nretrieved documents, we design a separate Reason-in-Documents module to deeply\nanalyze the retrieved information before injecting it into the reasoning chain,\nminimizing noise and preserving coherent reasoning flow. Extensive experiments\non complex reasoning tasks in science, mathematics, and coding, as well as six\nopen-domain QA benchmarks, demonstrate the strong performance of Search-o1.\nThis approach enhances the trustworthiness and applicability of LRMs in complex\nreasoning tasks, paving the way for more reliable and versatile intelligent\nsystems. The code is available at\n\\url{https://github.com/sunnynexus/Search-o1}.\n","authors":["Xiaoxi Li","Guanting Dong","Jiajie Jin","Yuyao Zhang","Yujia Zhou","Yutao Zhu","Peitian Zhang","Zhicheng Dou"],"pdf_url":"https://arxiv.org/pdf/2501.05366v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05360v1","updated":"2025-01-09T16:44:38Z","published":"2025-01-09T16:44:38Z","title":"On Corrigibility and Alignment in Multi Agent Games","summary":" Corrigibility of autonomous agents is an under explored part of system\ndesign, with previous work focusing on single agent systems. It has been\nsuggested that uncertainty over the human preferences acts to keep the agents\ncorrigible, even in the face of human irrationality. We present a general\nframework for modelling corrigibility in a multi-agent setting as a 2 player\ngame in which the agents always have a move in which they can ask the human for\nsupervision. 
This is formulated as a Bayesian game for the purpose of\nintroducing uncertainty over the human beliefs. We further analyse two specific\ncases. First, a two player corrigibility game, in which we want corrigibility\ndisplayed in both agents for both common payoff (monotone) games and harmonic\ngames. Then we investigate an adversary setting, in which one agent is\nconsidered to be a `defending' agent and the other an `adversary'. A general\nresult is provided for what belief over the games and human rationality the\ndefending agent is required to have to induce corrigibility.\n","authors":["Edmund Dable-Heath","Boyko Vodenicharski","James Bishop"],"pdf_url":"https://arxiv.org/pdf/2501.05360v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.20138v2","updated":"2025-01-09T16:36:26Z","published":"2024-12-28T12:54:06Z","title":"TradingAgents: Multi-Agents LLM Financial Trading Framework","summary":" Significant progress has been made in automated problem-solving using\nsocieties of agents powered by large language models (LLMs). In finance,\nefforts have largely focused on single-agent systems handling specific tasks or\nmulti-agent frameworks independently gathering data. However, multi-agent\nsystems' potential to replicate real-world trading firms' collaborative\ndynamics remains underexplored. TradingAgents proposes a novel stock trading\nframework inspired by trading firms, featuring LLM-powered agents in\nspecialized roles such as fundamental analysts, sentiment analysts, technical\nanalysts, and traders with varied risk profiles. The framework includes Bull\nand Bear researcher agents assessing market conditions, a risk management team\nmonitoring exposure, and traders synthesizing insights from debates and\nhistorical data to make informed decisions. By simulating a dynamic,\ncollaborative trading environment, this framework aims to improve trading\nperformance. 
Detailed architecture and extensive experiments reveal its\nsuperiority over baseline models, with notable improvements in cumulative\nreturns, Sharpe ratio, and maximum drawdown, highlighting the potential of\nmulti-agent LLM frameworks in financial trading. More details on TradingAgents\nare available at https://TradingAgents-AI.github.io.\n","authors":["Yijia Xiao","Edward Sun","Di Luo","Wei Wang"],"pdf_url":"https://arxiv.org/pdf/2412.20138v2.pdf","comment":"Multi-Agent AI in the Real World @ AAAI 2025"},{"id":"http://arxiv.org/abs/2411.10087v2","updated":"2025-01-09T16:22:42Z","published":"2024-11-15T10:16:38Z","title":"PFML: Self-Supervised Learning of Time-Series Data Without\n Representation Collapse","summary":" Self-supervised learning (SSL) is a data-driven learning approach that\nutilizes the innate structure of the data to guide the learning process. In\ncontrast to supervised learning, which depends on external labels, SSL utilizes\nthe inherent characteristics of the data to produce its own supervisory signal.\nHowever, one frequent issue with SSL methods is representation collapse, where\nthe model outputs a constant input-invariant feature representation. This issue\nhinders the potential application of SSL methods to new data modalities, as\ntrying to avoid representation collapse wastes researchers' time and effort.\nThis paper introduces a novel SSL algorithm for time-series data called\nPrediction of Functionals from Masked Latents (PFML). Instead of predicting\nmasked input signals or their latent representations directly, PFML operates by\npredicting statistical functionals of the input signal corresponding to masked\nembeddings, given a sequence of unmasked embeddings. The algorithm is designed\nto avoid representation collapse, rendering it straightforwardly applicable to\ndifferent time-series data domains, such as novel sensor modalities in clinical\ndata. 
We demonstrate the effectiveness of PFML through complex, real-life\nclassification tasks across three different data modalities: infant posture and\nmovement classification from multi-sensor inertial measurement unit data,\nemotion recognition from speech data, and sleep stage classification from EEG\ndata. The results show that PFML is superior to a conceptually similar SSL\nmethod and a contrastive learning-based SSL method. Additionally, PFML is on\npar with the current state-of-the-art SSL method, while also being conceptually\nsimpler and without suffering from representation collapse.\n","authors":["Einari Vaaras","Manu Airaksinen","Okko Räsänen"],"pdf_url":"https://arxiv.org/pdf/2411.10087v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05336v1","updated":"2025-01-09T16:02:51Z","published":"2025-01-09T16:02:51Z","title":"Stream Aligner: Efficient Sentence-Level Alignment via Distribution\n Induction","summary":" The rapid advancement of large language models (LLMs) has led to significant\nimprovements in their capabilities, but also to increased concerns about their\nalignment with human values and intentions. Current alignment strategies,\nincluding adaptive training and inference-time methods, have demonstrated\npotential in this area. However, these approaches still struggle to balance\ndeployment complexity and capability across various tasks and difficulties. In\nthis work, we introduce the Streaming Distribution Induce Aligner (Stream\nAligner), a novel alignment paradigm that combines efficiency with enhanced\nperformance in various tasks throughout the generation process. Stream Aligner\nachieves dynamic sentence-level correction by using a small model to learn the\npreferences of the suffix sentence, iteratively correcting the suffix sentence\noutput by the upstream model, and then using the corrected sentence to replace\nthe suffix sentence in subsequent generations. 
Compared to Aligner, our\nexperiments demonstrate that Stream Aligner reduces reliance on the\ncapabilities of additional models, enhances the reasoning abilities of LLMs,\nand decreases latency during user interaction. Specifically, Stream Aligner-2B\nmodel has achieved an improvement of 76.1% in helpfulness, 36.0% in\nharmlessness on the tested Llama2-70B-chat model, and Stream Aligner-8B has\nachieved an improvement of 3.5% on the math ability of the tested\nLlama3-70B-Instruct model.\n","authors":["Hantao Lou","Jiaming Ji","Kaile Wang","Yaodong Yang"],"pdf_url":"https://arxiv.org/pdf/2501.05336v1.pdf","comment":"AAAI Alignment Track 2025 Poster"},{"id":"http://arxiv.org/abs/2501.05334v1","updated":"2025-01-09T15:59:32Z","published":"2025-01-09T15:59:32Z","title":"The Bakers and Millers Game with Restricted Locations","summary":" We study strategic location choice by customers and sellers, termed the\nBakers and Millers Game in the literature. In our generalized setting, each\nmiller can freely choose any location for setting up a mill, while each baker\nis restricted in the choice of location for setting up a bakery. For optimal\nbargaining power, a baker would like to select a location with many millers to\nbuy flour from and with little competition from other bakers. Likewise, a\nmiller aims for a location with many bakers and few competing millers. Thus,\nboth types of agents choose locations to optimize the ratio of agents of\nopposite type divided by agents of the same type at their chosen location.\nOriginally raised in the context of Fractional Hedonic Games, the Bakers and\nMillers Game has applications that range from commerce to product design.\n We study the impact of location restrictions on the properties of the game.\nWhile pure Nash equilibria trivially exist in the setting without location\nrestrictions, we show via a sophisticated, efficient algorithm that even the\nmore challenging restricted setting admits equilibria. 
Moreover, the computed\nequilibrium approximates the optimal social welfare by a factor of at most\n$2\left(\frac{e}{e-1}\right)$. Furthermore, we give tight bounds on the price\nof anarchy/stability.\n On the conceptual side, the location choice feature adds a new layer to the\nstandard setting of Hedonic Games, in the sense that agents that select the\nsame location form a coalition. This allows us to naturally restrict the possible\ncoalitions that can be formed. With this, our model generalizes simple\nsymmetric Fractional Hedonic Games on complete bipartite valuation graphs and\nalso Hedonic Diversity Games with utilities single-peaked at 0. We believe that\nthis generalization is also a very interesting direction for other types of\nHedonic Games.\n","authors":["Simon Krogmann","Pascal Lenzner","Alexander Skopalik"],"pdf_url":"https://arxiv.org/pdf/2501.05334v1.pdf","comment":"To appear at the 24th International Conference on Autonomous Agents\n and Multiagent Systems (AAMAS 2025)"},{"id":"http://arxiv.org/abs/2501.05332v1","updated":"2025-01-09T15:58:37Z","published":"2025-01-09T15:58:37Z","title":"AnCoGen: Analysis, Control and Generation of Speech with a Masked\n Autoencoder","summary":" This article introduces AnCoGen, a novel method that leverages a masked\nautoencoder to unify the analysis, control, and generation of speech signals\nwithin a single model. AnCoGen can analyze speech by estimating key attributes,\nsuch as speaker identity, pitch, content, loudness, signal-to-noise ratio, and\nclarity index. In addition, it can generate speech from these attributes and\nallow precise control of the synthesized speech by modifying them. 
Extensive\nexperiments demonstrated the effectiveness of AnCoGen across speech\nanalysis-resynthesis, pitch estimation, pitch modification, and speech\nenhancement.\n","authors":["Samir Sadok","Simon Leglaive","Laurent Girin","Gaël Richard","Xavier Alameda-Pineda"],"pdf_url":"https://arxiv.org/pdf/2501.05332v1.pdf","comment":"5 pages, https://samsad35.github.io/site-ancogen"},{"id":"http://arxiv.org/abs/2302.08878v2","updated":"2025-01-09T15:35:59Z","published":"2023-02-17T13:50:53Z","title":"Less is More: The Influence of Pruning on the Explainability of CNNs","summary":" Modern, state-of-the-art Convolutional Neural Networks (CNNs) in computer\nvision have millions of parameters. Thus, explaining the complex decisions of\nsuch networks to humans is challenging. A technical approach to reduce CNN\ncomplexity is network pruning, where less important parameters are deleted. The\nwork presented in this paper investigates whether this technical complexity\nreduction also helps with perceived explainability. To do so, we conducted a\npre-study and two human-grounded experiments, assessing the effects of\ndifferent pruning ratios on CNN explainability. Overall, we evaluated four\ndifferent compression rates (i.e., CPR 2, 4, 8, and 32) with 37 500 tasks on\nMechanical Turk. Results indicate that lower compression rates have a positive\ninfluence on explainability, while higher compression rates show negative\neffects. 
Furthermore, we were able to identify sweet spots that increase both\nthe perceived explainability and the model's performance.\n","authors":["David Weber","Florian Merkle","Pascal Schöttle","Stephan Schlögl"],"pdf_url":"https://arxiv.org/pdf/2302.08878v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03145v2","updated":"2025-01-09T15:31:29Z","published":"2025-01-06T17:12:19Z","title":"Geometry Restoration and Dewarping of Camera-Captured Document Images","summary":" This research focuses on developing a method for restoring the topology of\ndigital images of paper documents captured by a camera, using algorithms for\ndetection, segmentation, geometry restoration, and dewarping. Our methodology\nemploys deep learning (DL) for document outline detection, followed by computer\nvision (CV) to create a topological 2D grid using cubic polynomial\ninterpolation and correct nonlinear distortions by remapping the image. Using\nclassical CV methods makes the document topology restoration process more\nefficient and faster, as it requires significantly fewer computational\nresources and memory. We developed a new pipeline for automatic document\ndewarping and reconstruction, along with a framework and annotated dataset to\ndemonstrate its efficiency. Our experiments confirm the promise of our\nmethodology and its superiority over existing benchmarks (including mobile apps\nand popular DL solutions, such as RectiNet, DocGeoNet, and DocTr++) both\nvisually and in terms of document readability via Optical Character Recognition\n(OCR) and geometry restoration metrics. This paves the way for creating\nhigh-quality digital copies of paper documents and enhancing the efficiency of\nOCR systems. 
Project page: https://github.com/HorizonParadox/DRCCBI\n","authors":["Valery Istomin","Oleg Pereziabov","Ilya Afanasyev"],"pdf_url":"https://arxiv.org/pdf/2501.03145v2.pdf","comment":"28 pages, 16 figures"},{"id":"http://arxiv.org/abs/2412.16378v2","updated":"2025-01-09T15:20:31Z","published":"2024-12-20T22:25:23Z","title":"REFA: Reference Free Alignment for multi-preference optimization","summary":" We introduce REFA, a family of reference-free alignment methods that optimize\nover multiple user preferences while enforcing fine-grained length control. Our\napproach integrates deviation-based weighting to emphasize high-quality\nresponses more strongly, length normalization to prevent trivial short-response\nsolutions, and an EOS-probability regularizer to mitigate dataset-induced\nbrevity biases. Theoretically, we show that under the Uncertainty Reduction\nwith Sequence Length Assertion (URSLA), naive length normalization can still\nincentivize length-based shortcuts. By contrast, REFA corrects these subtle\nincentives, guiding models toward genuinely more informative and higher-quality\noutputs. Empirically, REFA sets a new state-of-the-art among reference-free\nalignment methods, producing richer responses aligned more closely with human\npreferences. Compared to a base supervised fine-tuned (SFT) mistral-7b model\nthat achieves 8.4% length-controlled win rate (LC-WR) and 6.2% win rate (WR),\nour best REFA configuration attains 21.62% LC-WR and 19.87% WR on the\nAlpacaEval v2 benchmark. 
This represents a substantial improvement over both\nthe strongest multi-preference baseline, InfoNCA (16.82% LC-WR, 10.44% WR), and\nthe strongest reference-free baseline, SimPO (20.01% LC-WR, 17.65% WR)\n","authors":["Taneesh Gupta","Rahul Madhavan","Xuchao Zhang","Chetan Bansal","Saravan Rajmohan"],"pdf_url":"https://arxiv.org/pdf/2412.16378v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.16220v3","updated":"2025-01-09T14:55:29Z","published":"2024-12-18T10:56:40Z","title":"Cross-Attention Graph Neural Networks for Inferring Gene Regulatory\n Networks with Skewed Degree Distribution","summary":" Inferencing Gene Regulatory Networks (GRNs) from gene expression data is a\npivotal challenge in systems biology, and several innovative computational\nmethods have been introduced. However, most of these studies have not\nconsidered the skewed degree distribution of genes. Specifically, some genes\nmay regulate multiple target genes while some genes may be regulated by\nmultiple regulator genes. Such a skewed degree distribution issue significantly\ncomplicates the application of directed graph embedding methods. To tackle this\nissue, we propose the Cross-Attention Complex Dual Graph Embedding Model\n(XATGRN). Our XATGRN employs a cross-attention mechanism to effectively capture\nintricate gene interactions from gene expression profiles. Additionally, it\nuses a Dual Complex Graph Embedding approach to manage the skewed degree\ndistribution, thereby ensuring precise prediction of regulatory relationships\nand their directionality. Our model consistently outperforms existing\nstate-of-the-art methods across various datasets, underscoring its efficacy in\nelucidating complex gene regulatory mechanisms. 
Our code used in this paper\nis publicly available at: https://github.com/kikixiong/XATGRN.\n","authors":["Jiaqi Xiong","Nan Yin","Shiyang Liang","Haoyang Li","Yingxu Wang","Duo Ai","Fang Pan","Jingjie Wang"],"pdf_url":"https://arxiv.org/pdf/2412.16220v3.pdf","comment":"11 pages, 6 figures, 1 table"},{"id":"http://arxiv.org/abs/2501.01480v2","updated":"2025-01-09T14:52:13Z","published":"2025-01-02T15:09:00Z","title":"Drift2Matrix: Kernel-Induced Self Representation for Concept Drift\n Adaptation in Co-evolving Time Series","summary":" In the realm of time series analysis, tackling the phenomenon of concept\ndrift poses a significant challenge. Concept drift -- characterized by the\nevolving statistical properties of time series data -- affects the reliability\nand accuracy of conventional analysis models. This is particularly evident in\nco-evolving scenarios where interactions among variables are crucial. This\npaper presents Drift2Matrix, a novel framework that leverages kernel-induced\nself-representation for adaptive responses to concept drift in time series.\nDrift2Matrix employs a kernel-based learning mechanism to generate a\nrepresentation matrix, encapsulating the inherent dynamics of co-evolving time\nseries. This matrix serves as a key tool for identification and adaptation to\nconcept drift by observing its temporal variations. Furthermore, Drift2Matrix\neffectively identifies prevailing patterns and offers insights into emerging\ntrends through pattern evolution analysis. Our empirical evaluation of\nDrift2Matrix across various datasets demonstrates its effectiveness in handling\nthe complexities of concept drift. 
This approach introduces a novel perspective\nin the theoretical domain of co-evolving time series analysis, enhancing\nadaptability and accuracy in the face of dynamic data environments.\n","authors":["Kunpeng Xu","Lifei Chen","Shengrui Wang"],"pdf_url":"https://arxiv.org/pdf/2501.01480v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05278v1","updated":"2025-01-09T14:39:40Z","published":"2025-01-09T14:39:40Z","title":"Off-Policy Evaluation and Counterfactual Methods in Dynamic Auction\n Environments","summary":" Counterfactual estimators are critical for learning and refining policies\nusing logged data, a process known as Off-Policy Evaluation (OPE). OPE allows\nresearchers to assess new policies without costly experiments, speeding up the\nevaluation process. Online experimental methods, such as A/B tests, are\neffective but often slow, thus delaying the policy selection and optimization\nprocess.\n In this work, we explore the application of OPE methods in the context of\nresource allocation in dynamic auction environments. Given the competitive\nnature of environments where rapid decision-making is crucial for gaining a\ncompetitive edge, the ability to quickly and accurately assess algorithmic\nperformance is essential. By utilizing counterfactual estimators as a\npreliminary step before conducting A/B tests, we aim to streamline the\nevaluation process, reduce the time and resources required for experimentation,\nand enhance confidence in the chosen policies. Our investigation focuses on the\nfeasibility and effectiveness of using these estimators to predict the outcomes\nof potential resource allocation strategies, evaluate their performance, and\nfacilitate more informed decision-making in policy selection. 
Motivated by the\noutcomes of our initial study, we envision an advanced analytics system\ndesigned to seamlessly and dynamically assess new resource allocation\nstrategies and policies.\n","authors":["Ritam Guha","Nilavra Pathak"],"pdf_url":"https://arxiv.org/pdf/2501.05278v1.pdf","comment":"9 pages, 15 figures, IEEE format"},{"id":"http://arxiv.org/abs/2412.13426v2","updated":"2025-01-09T14:33:25Z","published":"2024-12-18T01:43:25Z","title":"Safeguarding System Prompts for LLMs","summary":" Large language models (LLMs) are increasingly utilized in applications where\nsystem prompts, which guide model outputs, play a crucial role. These prompts\noften contain business logic and sensitive information, making their protection\nessential. However, adversarial and even regular user queries can exploit LLM\nvulnerabilities to expose these hidden prompts. To address this issue, we\npropose PromptKeeper, a robust defense mechanism designed to safeguard system\nprompts. PromptKeeper tackles two core challenges: reliably detecting prompt\nleakage and mitigating side-channel vulnerabilities when leakage occurs. By\nframing detection as a hypothesis-testing problem, PromptKeeper effectively\nidentifies both explicit and subtle leakage. Upon detection, it regenerates\nresponses using a dummy prompt, ensuring that outputs remain indistinguishable\nfrom typical interactions when no leakage is present. 
PromptKeeper ensures\nrobust protection against prompt extraction attacks via either adversarial or\nregular queries, while preserving conversational capability and runtime\nefficiency during benign user interactions.\n","authors":["Zhifeng Jiang","Zhihua Jin","Guoliang He"],"pdf_url":"https://arxiv.org/pdf/2412.13426v2.pdf","comment":"15 pages, 5 figures, 2 tables"},{"id":"http://arxiv.org/abs/2501.05264v1","updated":"2025-01-09T14:19:33Z","published":"2025-01-09T14:19:33Z","title":"Towards Balanced Continual Multi-Modal Learning in Human Pose Estimation","summary":" 3D human pose estimation (3D HPE) has emerged as a prominent research topic,\nparticularly in the realm of RGB-based methods. However, RGB images are\nsusceptible to limitations such as sensitivity to lighting conditions and\npotential user discomfort. Consequently, multi-modal sensing, which leverages\nnon-intrusive sensors, is gaining increasing attention. Nevertheless,\nmulti-modal 3D HPE still faces challenges, including modality imbalance and the\nimperative for continual learning. In this work, we introduce a novel balanced\ncontinual multi-modal learning method for 3D HPE, which harnesses the power of\nRGB, LiDAR, mmWave, and WiFi. Specifically, we propose a Shapley value-based\ncontribution algorithm to quantify the contribution of each modality and\nidentify modality imbalance. To address this imbalance, we employ a re-learning\nstrategy. Furthermore, recognizing that raw data is prone to noise\ncontamination, we develop a novel denoising continual learning approach. This\napproach incorporates a noise identification and separation module to mitigate\nthe adverse effects of noise and collaborates with the balanced learning\nstrategy to enhance optimization. Additionally, an adaptive EWC mechanism is\nemployed to alleviate catastrophic forgetting. 
We conduct extensive experiments\non the widely-adopted multi-modal dataset, MM-Fi, which demonstrate the\nsuperiority of our approach in boosting 3D pose estimation and mitigating\ncatastrophic forgetting in complex scenarios. We will release our codes.\n","authors":["Jiaxuan Peng","Mengshi Qi","Dong Zhao","Huadong Ma"],"pdf_url":"https://arxiv.org/pdf/2501.05264v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05260v1","updated":"2025-01-09T14:14:18Z","published":"2025-01-09T14:14:18Z","title":"Enhancing Plagiarism Detection in Marathi with a Weighted Ensemble of\n TF-IDF and BERT Embeddings for Low-Resource Language Processing","summary":" Plagiarism involves using another person's work or concepts without proper\nattribution, presenting them as original creations. With the growing amount of\ndata communicated in regional languages such as Marathi -- one of India's\nregional languages -- it is crucial to design robust plagiarism detection\nsystems tailored for low-resource languages. Language models like Bidirectional\nEncoder Representations from Transformers (BERT) have demonstrated exceptional\ncapability in text representation and feature extraction, making them essential\ntools for semantic analysis and plagiarism detection. However, the application\nof BERT for low-resource languages remains under-explored, particularly in the\ncontext of plagiarism detection. This paper presents a method to enhance the\naccuracy of plagiarism detection for Marathi texts using BERT sentence\nembeddings in conjunction with Term Frequency-Inverse Document Frequency\n(TF-IDF) feature representation. 
This approach effectively captures\nstatistical, semantic, and syntactic aspects of text features through a\nweighted voting ensemble of machine learning models.\n","authors":["Atharva Mutsaddi","Aditya Choudhary"],"pdf_url":"https://arxiv.org/pdf/2501.05260v1.pdf","comment":"Accepted into LoResLM: The First Workshop on Language Models for\n Low-Resource Languages, colocated with COLING 2025 and set to be published\n into ACL Anthology"},{"id":"http://arxiv.org/abs/2501.05258v1","updated":"2025-01-09T14:13:39Z","published":"2025-01-09T14:13:39Z","title":"Automating the Detection of Code Vulnerabilities by Analyzing GitHub\n Issues","summary":" In today's digital landscape, the importance of timely and accurate\nvulnerability detection has significantly increased. This paper presents a\nnovel approach that leverages transformer-based models and machine learning\ntechniques to automate the identification of software vulnerabilities by\nanalyzing GitHub issues. We introduce a new dataset specifically designed for\nclassifying GitHub issues relevant to vulnerability detection. We then examine\nvarious classification techniques to determine their effectiveness. The results\ndemonstrate the potential of this approach for real-world application in early\nvulnerability detection, which could substantially reduce the window of\nexploitation for software vulnerabilities. This research makes a key\ncontribution to the field by providing a scalable and computationally efficient\nframework for automated detection, enabling the prevention of compromised\nsoftware usage before official notifications. 
This work has the potential to\nenhance the security of open-source software ecosystems.\n","authors":["Daniele Cipollone","Changjie Wang","Mariano Scazzariello","Simone Ferlin","Maliheh Izadi","Dejan Kostic","Marco Chiesa"],"pdf_url":"https://arxiv.org/pdf/2501.05258v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16828v3","updated":"2025-01-09T14:10:38Z","published":"2024-09-25T11:29:26Z","title":"On the role of Artificial Intelligence methods in modern\n force-controlled manufacturing robotic tasks","summary":" This position paper explores the integration of Artificial Intelligence (AI)\ninto force-controlled robotic tasks within the scope of advanced manufacturing,\na cornerstone of Industry 4.0. AI's role in enhancing robotic manipulators -\nkey drivers in the Fourth Industrial Revolution - is rapidly leading to\nsignificant innovations in smart manufacturing. The objective of this article\nis to frame these innovations in practical force-controlled applications - e.g.\ndeburring, polishing, and assembly tasks like peg-in-hole (PiH) - highlighting\ntheir necessity for maintaining high-quality production standards. By reporting\non recent AI-based methodologies, this article contrasts them and identifies\ncurrent challenges to be addressed in future research. The analysis concludes\nwith a perspective on future research directions, emphasizing the need for\ncommon performance metrics to validate AI techniques, integration of various\nenhancements for performance optimization, and the importance of validating\nthem in relevant scenarios. 
These future directions aim to provide consistency\nwith already adopted approaches, so as to be compatible with manufacturing\nstandards, increasing the relevance of AI-driven methods in both academic and\nindustrial contexts.\n","authors":["Vincenzo Petrone","Enrico Ferrentino","Pasquale Chiacchio"],"pdf_url":"https://arxiv.org/pdf/2409.16828v3.pdf","comment":"In Proceedings of the 21st International Conference on Informatics in\n Control, Automation and Robotics - Volume 1: ICINCO, 392-399, 2024 , Porto,\n Portugal"},{"id":"http://arxiv.org/abs/2410.05838v2","updated":"2025-01-09T14:04:01Z","published":"2024-10-08T09:06:34Z","title":"Time Transfer: On Optimal Learning Rate and Batch Size In The Infinite\n Data Limit","summary":" One of the main challenges in optimal scaling of large language models (LLMs)\nis the prohibitive cost of hyperparameter tuning, particularly learning rate\n$\\eta$ and batch size $B$. While techniques like $\\mu$P (Yang et al., 2022)\nprovide scaling rules for optimal $\\eta$ transfer in the infinite model size\nlimit, the optimal scaling behavior in the infinite data size limit remains\nunknown. We fill in this gap by observing for the first time an intricate\ndependence of optimal $\\eta$ scaling on the pretraining token budget $T$, $B$\nand its relation to the critical batch size $B_\\mathrm{crit}$, which we measure\nto evolve as $B_\\mathrm{crit} \\propto T$. Furthermore, we show that the optimal\nbatch size is positively correlated with $B_\\mathrm{crit}$: keeping it fixed\nbecomes suboptimal over time even if learning rate is scaled optimally.\nSurprisingly, our results demonstrate that the observed optimal $\\eta$ and $B$\ndynamics are preserved with $\\mu$P model scaling, challenging the conventional\nview of $B_\\mathrm{crit}$ dependence solely on loss value. 
Complementing\noptimality, we examine the sensitivity of loss to changes in learning rate,\nwhere we find the sensitivity to decrease with increase of $T$ and to remain\nconstant with $\\mu$P model scaling. We hope our results make the first step\ntowards a unified picture of the joint optimal data and model scaling.\n","authors":["Oleg Filatov","Jan Ebert","Jiangtao Wang","Stefan Kesselheim"],"pdf_url":"https://arxiv.org/pdf/2410.05838v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05252v1","updated":"2025-01-09T14:03:35Z","published":"2025-01-09T14:03:35Z","title":"From Scientific Texts to Verifiable Code: Automating the Process with\n Transformers","summary":" Despite the vast body of research literature proposing algorithms with formal\nguarantees, the amount of verifiable code in today's systems remains minimal.\nThis discrepancy stems from the inherent difficulty of verifying code,\nparticularly due to the time-consuming nature and strict formalism of proof\ndetails that formal verification tools require. However, the emergence of\ntransformers in Large Language Models presents a promising solution to this\nchallenge. In this position paper, we believe that transformers have the\npotential to read research papers that propose algorithms with formal proofs\nand translate these proofs into verifiable code. We leverage transformers to\nfirst build a formal structure of the proof using the original text from the\npaper, and then to handle the tedious, low-level aspects of proofs that are\noften omitted by humans. We argue that this approach can significantly reduce\nthe barrier to formal verification. 
The above idea of reading papers to write\nverifiable code opens new avenues for automating the verification of complex\nsystems, enabling a future where formally verified algorithms from academic\nresearch can more seamlessly transition into real-world software systems,\nthereby improving code reliability and security.\n","authors":["Changjie Wang","Mariano Scazzariello","Marco Chiesa"],"pdf_url":"https://arxiv.org/pdf/2501.05252v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05249v1","updated":"2025-01-09T14:01:15Z","published":"2025-01-09T14:01:15Z","title":"RAG-WM: An Efficient Black-Box Watermarking Approach for\n Retrieval-Augmented Generation of Large Language Models","summary":" In recent years, tremendous success has been witnessed in Retrieval-Augmented\nGeneration (RAG), widely used to enhance Large Language Models (LLMs) in\ndomain-specific, knowledge-intensive, and privacy-sensitive tasks. However,\nattackers may steal those valuable RAGs and deploy or commercialize them,\nmaking it essential to detect Intellectual Property (IP) infringement. Most\nexisting ownership protection solutions, such as watermarks, are designed for\nrelational databases and texts. They cannot be directly applied to RAGs because\nrelational database watermarks require white-box access to detect IP\ninfringement, which is unrealistic for the knowledge base in RAGs. Meanwhile,\npost-processing by the adversary's deployed LLMs typically destructs text\nwatermark information. To address those problems, we propose a novel black-box\n\"knowledge watermark\" approach, named RAG-WM, to detect IP infringement of\nRAGs. RAG-WM uses a multi-LLM interaction framework, comprising a Watermark\nGenerator, Shadow LLM & RAG, and Watermark Discriminator, to create watermark\ntexts based on watermark entity-relationship tuples and inject them into the\ntarget RAG. We evaluate RAG-WM across three domain-specific and two\nprivacy-sensitive tasks on four benchmark LLMs. 
Experimental results show that\nRAG-WM effectively detects the stolen RAGs in various deployed LLMs.\nFurthermore, RAG-WM is robust against paraphrasing, unrelated content removal,\nknowledge insertion, and knowledge expansion attacks. Lastly, RAG-WM can also\nevade watermark detection approaches, highlighting its promising application in\ndetecting IP infringement of RAG systems.\n","authors":["Peizhuo Lv","Mengjie Sun","Hao Wang","Xiaofeng Wang","Shengzhi Zhang","Yuxuan Chen","Kai Chen","Limin Sun"],"pdf_url":"https://arxiv.org/pdf/2501.05249v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05248v1","updated":"2025-01-09T14:00:01Z","published":"2025-01-09T14:00:01Z","title":"Deriving Coding-Specific Sub-Models from LLMs using Resource-Efficient\n Pruning","summary":" Large Language Models (LLMs) have demonstrated their exceptional performance\nin various complex code generation tasks. However, their broader adoption is\nlimited by significant computational demands and high resource requirements,\nparticularly memory and processing power. To mitigate such requirements, model\npruning techniques are used to create more compact models with significantly\nfewer parameters. However, current approaches do not focus on the efficient\nextraction of programming-language-specific sub-models. In this work, we\nexplore the idea of efficiently deriving coding-specific sub-models through\nunstructured pruning (i.e., Wanda). We investigate the impact of different\ndomain-specific calibration datasets on pruning outcomes across three distinct\ndomains and extend our analysis to extracting four language-specific\nsub-models: Python, Java, C++, and JavaScript. We are the first to efficiently\nextract programming-language-specific sub-models using appropriate calibration\ndatasets while maintaining acceptable accuracy w.r.t. full models. 
We are also\nthe first to provide analytical evidence that domain-specific tasks activate\ndistinct regions within LLMs, supporting the creation of specialized sub-models\nthrough unstructured pruning. We believe that this work has significant\npotential to enhance LLM accessibility for coding by reducing computational\nrequirements to enable local execution on consumer-grade hardware, and\nsupporting faster inference times critical for real-time development feedback.\n","authors":["Laura Puccioni","Alireza Farshin","Mariano Scazzariello","Changjie Wang","Marco Chiesa","Dejan Kostic"],"pdf_url":"https://arxiv.org/pdf/2501.05248v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05247v1","updated":"2025-01-09T13:57:09Z","published":"2025-01-09T13:57:09Z","title":"Online Prompt and Solver Selection for Program Synthesis","summary":" Large Language Models (LLMs) demonstrate impressive capabilities in the\ndomain of program synthesis. This level of performance is not, however,\nuniversal across all tasks, all LLMs and all prompting styles. There are many\nareas where one LLM dominates, one prompting style dominates, or where calling\na symbolic solver is a better choice than an LLM. A key challenge for the user\nthen, is to identify not only when an LLM is the right choice of solver, and\nthe appropriate LLM to call for a given synthesis task, but also the right way\nto call it. A non-expert user who makes the wrong choice, incurs a cost both in\nterms of results (number of tasks solved, and the time it takes to solve them)\nand financial cost, if using a closed-source language model via a commercial\nAPI. We frame this choice as an online learning problem. 
We use a multi-armed\nbandit algorithm to select which symbolic solver, or LLM and prompt combination\nto deploy in order to maximize a given reward function (which may prioritize\nsolving time, number of synthesis tasks solved, or financial cost of solving).\nWe implement an instance of this approach, called CYANEA, and evaluate it on\nsynthesis queries from the literature in ranking function synthesis, from the\nsyntax-guided synthesis competition, and fresh, unseen queries generated from\nSMT problems. CYANEA solves 37.2\\% more queries than the best single solver and\nachieves results within 4\\% of the virtual best solver.\n","authors":["Yixuan Li","Lewis Frampton","Federico Mora","Elizabeth Polgreen"],"pdf_url":"https://arxiv.org/pdf/2501.05247v1.pdf","comment":"Accepted at the 39th AAAI Conference on Artificial Intelligence\n (AAAI-25) Main Track"},{"id":"http://arxiv.org/abs/2411.06928v2","updated":"2025-01-09T13:56:49Z","published":"2024-11-11T12:32:26Z","title":"Multi-class Decoding of Attended Speaker Direction Using\n Electroencephalogram and Audio Spatial Spectrum","summary":" Decoding the directional focus of an attended speaker from listeners'\nelectroencephalogram (EEG) signals is essential for developing brain-computer\ninterfaces to improve the quality of life for individuals with hearing\nimpairment. Previous works have concentrated on binary directional focus\ndecoding, i.e., determining whether the attended speaker is on the left or\nright side of the listener. However, a more precise decoding of the exact\ndirection of the attended speaker is necessary for effective speech processing.\nAdditionally, audio spatial information has not been effectively leveraged,\nresulting in suboptimal decoding results. 
In this paper, it is found that on\nthe recently presented dataset with 14-class directional focus, models relying\nexclusively on EEG inputs exhibit significantly lower accuracy when decoding\nthe directional focus in both leave-one-subject-out and leave-one-trial-out\nscenarios. By integrating audio spatial spectra with EEG features, the decoding\naccuracy can be effectively improved. The CNN, LSM-CNN, and Deformer models are\nemployed to decode the directional focus from listeners' EEG signals and audio\nspatial spectra. The proposed Sp-EEG-Deformer model achieves notable 14-class\ndecoding accuracies of 55.35% and 57.19% in leave-one-subject-out and\nleave-one-trial-out scenarios with a decision window of 1 second, respectively.\nExperimental results indicate increased decoding accuracy as the number of\nalternative directions decreases. These findings suggest the efficacy of our\nproposed dual-modal directional focus decoding strategy.\n","authors":["Yuanming Zhang","Jing Lu","Fei Chen","Haoliang Du","Xia Gao","Zhibin Lin"],"pdf_url":"https://arxiv.org/pdf/2411.06928v2.pdf","comment":"Submitted to IEEE TNSRE"},{"id":"http://arxiv.org/abs/2501.05234v1","updated":"2025-01-09T13:41:37Z","published":"2025-01-09T13:41:37Z","title":"Optimizing Estonian TV Subtitles with Semi-supervised Learning and LLMs","summary":" This paper presents an approach for generating high-quality, same-language\nsubtitles for Estonian TV content. We fine-tune the Whisper model on\nhuman-generated Estonian subtitles and enhance it with iterative\npseudo-labeling and large language model (LLM) based post-editing. Our\nexperiments demonstrate notable subtitle quality improvement through\npseudo-labeling with an unlabeled dataset. We find that applying LLM-based\nediting at test time enhances subtitle accuracy, while its use during training\ndoes not yield further gains. 
This approach holds promise for creating subtitle\nquality close to the human standard and could be extended to real-time\napplications.\n","authors":["Artem Fedorchenko","Tanel Alumäe"],"pdf_url":"https://arxiv.org/pdf/2501.05234v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15879v2","updated":"2025-01-09T13:27:29Z","published":"2024-07-20T10:45:06Z","title":"Decentralized Federated Anomaly Detection in Smart Grids: A P2P Gossip\n Approach","summary":" The increasing security and privacy concerns in the Smart Grid sector have\nled to a significant demand for robust intrusion detection systems within\ncritical smart grid infrastructure. To address the challenges posed by privacy\npreservation and decentralized power system zones with distinct data ownership,\nFederated Learning (FL) has emerged as a promising privacy-preserving solution\nwhich facilitates collaborative training of attack detection models without\nnecessitating the sharing of raw data. However, FL presents several\nimplementation limitations in the power system domain due to its heavy reliance\non a centralized aggregator and the risks of privacy leakage during model\nupdate transmission. To overcome these technical bottlenecks, this paper\nintroduces a novel decentralized federated anomaly detection scheme based on\ntwo main gossip protocols, namely Random Walk and Epidemic. Our findings\nindicate that the Random Walk protocol exhibits superior performance compared\nto the Epidemic protocol, highlighting its efficacy in decentralized federated\nlearning environments. Experimental validation of the proposed framework\nutilizing publicly available industrial control systems datasets demonstrates\nsuperior attack detection accuracy while safeguarding data confidentiality and\nmitigating the impact of communication latency and stragglers. 
Furthermore, our\napproach yields a notable 35% improvement in training time compared to\nconventional FL, underscoring the efficacy and robustness of our decentralized\nlearning method.\n","authors":["Muhammad Akbar Husnoo","Adnan Anwar","Md Enamul Haque","A. N. Mahmood"],"pdf_url":"https://arxiv.org/pdf/2407.15879v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05220v1","updated":"2025-01-09T13:13:24Z","published":"2025-01-09T13:13:24Z","title":"A Novel Approach to Scalable and Automatic Topic-Controlled Question\n Generation in Education","summary":" The development of Automatic Question Generation (QG) models has the\npotential to significantly improve educational practices by reducing the\nteacher workload associated with creating educational content. This paper\nintroduces a novel approach to educational question generation that controls\nthe topical focus of questions. The proposed Topic-Controlled Question\nGeneration (T-CQG) method enhances the relevance and effectiveness of the\ngenerated content for educational purposes. Our approach uses fine-tuning on a\npre-trained T5-small model, employing specially created datasets tailored to\neducational needs. The research further explores the impacts of pre-training\nstrategies, quantisation, and data augmentation on the model's performance. We\nspecifically address the challenge of generating semantically aligned questions\nwith paragraph-level contexts, thereby improving the topic specificity of the\ngenerated questions. In addition, we introduce and explore novel evaluation\nmethods to assess the topical relatedness of the generated questions. Our\nresults, validated through rigorous offline and human-backed evaluations,\ndemonstrate that the proposed models effectively generate high-quality,\ntopic-focused questions. These models have the potential to reduce teacher\nworkload and support personalised tutoring systems by serving as bespoke\nquestion generators. 
With their relatively small number of parameters, the\nproposed models not only advance the capabilities of question generation models\nfor handling specific educational topics but also offer a scalable solution\nthat reduces infrastructure costs. This scalability makes them feasible for\nwidespread use in education without reliance on proprietary large language\nmodels like ChatGPT.\n","authors":["Ziqing Li","Mutlu Cukurova","Sahan Bulathwela"],"pdf_url":"https://arxiv.org/pdf/2501.05220v1.pdf","comment":"To be published at ACM Conf. on Learning Analytics and Knowledge\n (LAK'25)"},{"id":"http://arxiv.org/abs/2501.05213v1","updated":"2025-01-09T13:06:47Z","published":"2025-01-09T13:06:47Z","title":"GLaM-Sign: Greek Language Multimodal Lip Reading with Integrated Sign\n Language Accessibility","summary":" The Greek Language Multimodal Lip Reading with Integrated Sign Language\nAccessibility (GLaM-Sign) [1] is a groundbreaking resource in accessibility and\nmultimodal AI, designed to support Deaf and Hard-of-Hearing (DHH) individuals.\nDeveloped from the FEELIT project [2], it integrates high-resolution audio,\nvideo, textual transcriptions, and Greek Sign Language translations for\napplications like real-time sign language translation and enhanced subtitle\nsynchronization. While its primary focus is on promoting inclusivity in the\nGreek tourism sector, its adaptability extends to education, healthcare, and\npublic services. Future advancements will enhance word-level precision and\nscalability to additional languages, supported by advanced AI methodologies and\ncollaborations with diverse stakeholders. 
This dataset underscores the\ntransformative potential of multimodal resources in bridging communication\ngaps, fostering innovation, and setting a benchmark for ethical AI and\ninclusive technologies.\n","authors":["Dimitris Kouremenos","Klimis Ntalianis"],"pdf_url":"https://arxiv.org/pdf/2501.05213v1.pdf","comment":"9 pages, 4 figures"},{"id":"http://arxiv.org/abs/2501.05205v1","updated":"2025-01-09T12:55:55Z","published":"2025-01-09T12:55:55Z","title":"Discovering Hidden Visual Concepts Beyond Linguistic Input in Infant\n Learning","summary":" Infants develop complex visual understanding rapidly, even preceding the\nacquisition of linguistic inputs. As computer vision seeks to replicate the\nhuman vision system, understanding infant visual development may offer valuable\ninsights. In this paper, we present an interdisciplinary study exploring this\nquestion: can a computational model that imitates the infant learning process\ndevelop broader visual concepts that extend beyond the vocabulary it has heard,\nsimilar to how infants naturally learn? To investigate this, we analyze a\nrecently published model in Science by Vong et al., which is trained on\nlongitudinal, egocentric images of a single child paired with transcribed\nparental speech. We introduce a training-free framework that can discover\nvisual concept neurons hidden in the model's internal representations. Our\nfindings show that these neurons can classify objects outside the model's\noriginal vocabulary. Furthermore, we compare the visual representations in\ninfant-like models with those in modern computer vision models, such as CLIP or\nImageNet pre-trained models, highlighting key similarities and differences. 
Ultimately,\nour work bridges cognitive science and computer vision by analyzing the\ninternal representations of a computational model trained on an infant's visual\nand linguistic inputs.\n","authors":["Xueyi Ke","Satoshi Tsutsui","Yayun Zhang","Bihan Wen"],"pdf_url":"https://arxiv.org/pdf/2501.05205v1.pdf","comment":"12 pages, 11 figures"},{"id":"http://arxiv.org/abs/2501.05197v1","updated":"2025-01-09T12:48:15Z","published":"2025-01-09T12:48:15Z","title":"An Algorithmic Approach for Causal Health Equity: A Look at Race\n Differentials in Intensive Care Unit (ICU) Outcomes","summary":" The new era of large-scale data collection and analysis presents an\nopportunity for diagnosing and understanding the causes of health inequities.\nIn this study, we describe a framework for systematically analyzing health\ndisparities using causal inference. The framework is illustrated by\ninvestigating racial and ethnic disparities in intensive care unit (ICU)\noutcome between majority and minority groups in Australia (Indigenous vs.\nNon-Indigenous) and the United States (African-American vs. White). We\ndemonstrate that commonly used statistical measures for quantifying inequity\nare insufficient, and focus on attributing the observed disparity to the causal\nmechanisms that generate it. We find that minority patients are younger at\nadmission, have worse chronic health, are more likely to be admitted for urgent\nand non-elective reasons, and have higher illness severity. At the same time,\nhowever, we find a protective direct effect of belonging to a minority group,\nwith minority patients showing improved survival compared to their majority\ncounterparts, with all other variables kept equal. We demonstrate that this\nprotective effect is related to the increased probability of being admitted to\nICU, with minority patients having an increased risk of ICU admission. 
We also\nfind that minority patients, while showing improved survival, are more likely\nto be readmitted to ICU. Thus, due to worse access to primary health care,\nminority patients are more likely to end up in ICU for preventable conditions,\ncausing a reduction in the mortality rates and creating an effect that appears\nto be protective. Since the baseline risk of ICU admission may serve as proxy\nfor lack of access to primary care, we developed the Indigenous Intensive Care\nEquity (IICE) Radar, a monitoring system for tracking the over-utilization of\nICU resources by the Indigenous population of Australia across geographical\nareas.\n","authors":["Drago Plecko","Paul Secombe","Andrea Clarke","Amelia Fiske","Samarra Toby","Donisha Duff","David Pilcher","Leo Anthony Celi","Rinaldo Bellomo","Elias Bareinboim"],"pdf_url":"https://arxiv.org/pdf/2501.05197v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.09094v2","updated":"2025-01-09T12:38:37Z","published":"2024-12-12T09:22:04Z","title":"Filter-then-Generate: Large Language Models with Structure-Text Adapter\n for Knowledge Graph Completion","summary":" Large Language Models (LLMs) present massive inherent knowledge and superior\nsemantic comprehension capability, which have revolutionized various tasks in\nnatural language processing. Despite their success, a critical gap remains in\nenabling LLMs to perform knowledge graph completion (KGC). Empirical evidence\nsuggests that LLMs consistently perform worse than conventional KGC approaches,\neven through sophisticated prompt design or tailored instruction-tuning.\nFundamentally, applying LLMs on KGC introduces several critical challenges,\nincluding a vast set of entity candidates, hallucination issue of LLMs, and\nunder-exploitation of the graph structure. To address these challenges, we\npropose a novel instruction-tuning-based method, namely FtG. 
Specifically, we\npresent a \textit{filter-then-generate} paradigm and formulate the KGC task\nas a multiple-choice question format. In this way, we can harness the\ncapability of LLMs while mitigating the issues caused by hallucinations.\nMoreover, we devise a flexible ego-graph serialization prompt and employ a\nstructure-text adapter to couple structure and text information in a\ncontextualized manner. Experimental results demonstrate that FtG achieves\nsubstantial performance gains compared to existing state-of-the-art methods. The\ninstruction dataset and code are available at\n\url{https://github.com/LB0828/FtG}.\n","authors":["Ben Liu","Jihai Zhang","Fangquan Lin","Cheng Yang","Min Peng"],"pdf_url":"https://arxiv.org/pdf/2412.09094v2.pdf","comment":"COLING 2025 Main Conference"},{"id":"http://arxiv.org/abs/2412.11120v2","updated":"2025-01-09T11:39:32Z","published":"2024-12-15T08:51:14Z","title":"Latent Reward: LLM-Empowered Credit Assignment in Episodic Reinforcement\n Learning","summary":" Reinforcement learning (RL) often encounters delayed and sparse feedback in\nreal-world applications, even with only episodic rewards. Previous approaches\nhave made some progress in reward redistribution for credit assignment but\nstill face challenges, including training difficulties due to redundancy and\nambiguous attributions stemming from overlooking the multifaceted nature of\nmission performance evaluation. Promisingly, Large Language Models (LLMs)\nencompass fruitful decision-making knowledge and provide a plausible tool\nfor reward redistribution. Even so, deploying an LLM in this case is non-trivial\ndue to the misalignment between linguistic knowledge and the symbolic form\nrequirement, together with inherent randomness and hallucinations in inference.\nTo tackle these issues, we introduce LaRe, a novel LLM-empowered symbolic-based\ndecision-making framework, to improve credit assignment. 
Key to LaRe is the\nconcept of the Latent Reward, which works as a multi-dimensional performance\nevaluation, enabling more interpretable goal attainment from various\nperspectives and facilitating more effective reward redistribution. We verify\nthat semantically generated code from the LLM can bridge linguistic knowledge and\nsymbolic latent rewards, as it is executable for symbolic objects. Meanwhile,\nwe design latent reward self-verification to increase the stability and\nreliability of LLM inference. Theoretically, reward-irrelevant redundancy\nelimination in the latent reward benefits RL performance from more accurate\nreward estimation. Extensive experimental results show that LaRe (i)\nachieves superior temporal credit assignment to SOTA methods, (ii) excels in\nallocating contributions among multiple agents, and (iii) outperforms policies\ntrained with ground truth rewards for certain tasks.\n","authors":["Yun Qu","Yuhang Jiang","Boyuan Wang","Yixiu Mao","Cheems Wang","Chang Liu","Xiangyang Ji"],"pdf_url":"https://arxiv.org/pdf/2412.11120v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05165v1","updated":"2025-01-09T11:38:58Z","published":"2025-01-09T11:38:58Z","title":"Bringing Order Amidst Chaos: On the Role of Artificial Intelligence in\n Secure Software Engineering","summary":" Context. Developing secure and reliable software remains a key challenge in\nsoftware engineering (SE). The ever-evolving technological landscape offers\nboth opportunities and threats, creating a dynamic space where chaos and order\ncompete. Secure software engineering (SSE) must continuously address\nvulnerabilities that endanger software systems and carry broader socio-economic\nrisks, such as compromising critical national infrastructure and causing\nsignificant financial losses. 
Researchers and practitioners have explored\nmethodologies like Static Application Security Testing Tools (SASTTs) and\nartificial intelligence (AI) approaches, including machine learning (ML) and\nlarge language models (LLMs), to detect and mitigate these vulnerabilities.\nEach method has unique strengths and limitations.\n Aim. This thesis seeks to bring order to the chaos in SSE by addressing\ndomain-specific differences that impact AI accuracy.\n Methodology. The research employs a mix of empirical strategies, such as\nevaluating effort-aware metrics, analyzing SASTTs, conducting method-level\nanalysis, and leveraging evidence-based techniques like systematic dataset\nreviews. These approaches help characterize vulnerability prediction datasets.\n Results. Key findings include limitations in static analysis tools for\nidentifying vulnerabilities, gaps in SASTT coverage of vulnerability types,\nweak relationships among vulnerability severity scores, improved defect\nprediction accuracy using just-in-time modeling, and threats posed by untouched\nmethods.\n Conclusions. This thesis highlights the complexity of SSE and the importance\nof contextual knowledge in improving AI-driven vulnerability and defect\nprediction. The comprehensive analysis advances effective prediction models,\nbenefiting both researchers and practitioners.\n","authors":["Matteo Esposito"],"pdf_url":"https://arxiv.org/pdf/2501.05165v1.pdf","comment":"PhD thesis"},{"id":"http://arxiv.org/abs/2501.05163v1","updated":"2025-01-09T11:36:29Z","published":"2025-01-09T11:36:29Z","title":"Explainable AI based System for Supply Air Temperature Forecast","summary":" This paper explores the application of Explainable AI (XAI) techniques to\nimprove the transparency and understanding of predictive models in control of\nautomated supply air temperature (ASAT) of Air Handling Unit (AHU). 
The study\nfocuses on forecasting ASAT using a linear regression with Huber loss.\nHowever, having only a control curve without semantic and/or physical\nexplanation is often not enough. The present study employs one of the XAI\nmethods: Shapley values, which allows us to reveal the reasoning and highlight\nthe contribution of each feature to the final ASAT forecast. In comparison to\nother XAI methods, Shapley values have a solid mathematical background,\nresulting in interpretation transparency. The study demonstrates contrastive\nexplanations (slices) for each control value of ASAT, making it possible\nto give the client objective justifications for curve changes.\n","authors":["Marika Eik","Ahmet Kose","Hossein Nourollahi Hokmabad","Juri Belikov"],"pdf_url":"https://arxiv.org/pdf/2501.05163v1.pdf","comment":"5 pages, 7 figures, 1 table, conference paper"},{"id":"http://arxiv.org/abs/2409.00717v3","updated":"2025-01-09T11:24:44Z","published":"2024-09-01T13:14:41Z","title":"Preference-Based Multi-Agent Reinforcement Learning: Data Coverage and\n Algorithmic Techniques","summary":" We initiate the study of Preference-Based Multi-Agent Reinforcement Learning\n(PbMARL), exploring both theoretical foundations and empirical validations. We\ndefine the task as identifying the Nash equilibrium from a preference-only\noffline dataset in general-sum games, a problem marked by the challenge of\nsparse feedback signals. Our theory establishes the upper complexity bounds for\nNash Equilibrium in effective PbMARL, demonstrating that single-policy coverage\nis inadequate and highlighting the importance of unilateral dataset coverage.\nThese theoretical insights are verified through comprehensive experiments. To\nenhance the practical performance, we further introduce two algorithmic\ntechniques. (1) We propose a Mean Squared Error (MSE) regularization along the\ntime axis to achieve a more uniform reward distribution and improve reward\nlearning outcomes. 
(2) We propose an additional penalty based on the\ndistribution of the dataset to incorporate pessimism, improving stability and\neffectiveness during training. Our findings underscore the multifaceted\napproach required for PbMARL, paving the way for effective preference-based\nmulti-agent systems.\n","authors":["Natalia Zhang","Xinqi Wang","Qiwen Cui","Runlong Zhou","Sham M. Kakade","Simon S. Du"],"pdf_url":"https://arxiv.org/pdf/2409.00717v3.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2501.05155v1","updated":"2025-01-09T11:19:40Z","published":"2025-01-09T11:19:40Z","title":"Biomedical Relation Extraction via Adaptive Document-Relation\n Cross-Mapping and Concept Unique Identifier","summary":" Document-Level Biomedical Relation Extraction (Bio-RE) aims to identify\nrelations between biomedical entities within extensive texts, serving as a\ncrucial subfield of biomedical text mining. Existing Bio-RE methods struggle\nwith cross-sentence inference, which is essential for capturing relations\nspanning multiple sentences. Moreover, previous methods often overlook the\nincompleteness of documents and lack the integration of external knowledge,\nlimiting contextual richness. Besides, the scarcity of annotated data further\nhampers model training. Recent advancements in large language models (LLMs)\nhave inspired us to explore all the above issues for document-level Bio-RE.\nSpecifically, we propose a document-level Bio-RE framework via LLM Adaptive\nDocument-Relation Cross-Mapping (ADRCM) Fine-Tuning and Concept Unique\nIdentifier (CUI) Retrieval-Augmented Generation (RAG). First, we introduce the\nIteration-of-REsummary (IoRs) prompt for solving the data scarcity issue. 
In\nthis way, Bio-RE task-specific synthetic data can be generated by guiding\nChatGPT to focus on entity relations and iteratively refining synthetic data.\nNext, we propose ADRCM fine-tuning, a novel fine-tuning recipe that establishes\nmappings across different documents and relations, enhancing the model's\ncontextual understanding and cross-sentence inference capabilities. Finally,\nduring inference, a biomedical-specific RAG approach, named CUI RAG, is\ndesigned to leverage CUIs as indexes for entities, narrowing the retrieval\nscope and enriching the relevant document contexts. Experiments conducted on\nthree Bio-RE datasets (GDA, CDR, and BioRED) demonstrate the state-of-the-art\nperformance of our proposed method by comparing it with other related works.\n","authors":["Yufei Shang","Yanrong Guo","Shijie Hao","Richang Hong"],"pdf_url":"https://arxiv.org/pdf/2501.05155v1.pdf","comment":"13 pages, 6 figures"},{"id":"http://arxiv.org/abs/2501.02648v2","updated":"2025-01-09T11:17:01Z","published":"2025-01-05T20:26:49Z","title":"Representation Learning of Lab Values via Masked AutoEncoder","summary":" Accurate imputation of missing laboratory values in electronic health records\n(EHRs) is critical to enable robust clinical predictions and reduce biases in\nAI systems in healthcare. Existing methods, such as variational autoencoders\n(VAEs) and decision tree-based approaches such as XGBoost, struggle to model\nthe complex temporal and contextual dependencies in EHR data, mainly in\nunderrepresented groups. In this work, we propose Lab-MAE, a novel\ntransformer-based masked autoencoder framework that leverages self-supervised\nlearning for the imputation of continuous sequential lab values. Lab-MAE\nintroduces a structured encoding scheme that jointly models laboratory test\nvalues and their corresponding timestamps, enabling explicit capture of\ntemporal dependencies. 
Empirical evaluation on the MIMIC-IV dataset demonstrates that\nLab-MAE significantly outperforms the state-of-the-art baselines such as\nXGBoost across multiple metrics, including root mean square error (RMSE),\nR-squared (R2), and Wasserstein distance (WD). Notably, Lab-MAE achieves\nequitable performance across demographic groups of patients, advancing fairness\nin clinical predictions. We further investigate the role of follow-up\nlaboratory values as potential shortcut features, revealing Lab-MAE's\nrobustness in scenarios where such data is unavailable. The findings suggest\nthat our transformer-based architecture, adapted to the characteristics of the\nEHR data, offers a foundation model for more accurate and fair clinical\nimputation models. In addition, we measure and compare the carbon footprint of\nLab-MAE with the baseline XGBoost model, highlighting its environmental\nrequirements.\n","authors":["David Restrepo","Chenwei Wu","Yueran Jia","Jaden K. Sun","Jack Gallifant","Catherine G. Bielick","Yugang Jia","Leo A. Celi"],"pdf_url":"https://arxiv.org/pdf/2501.02648v2.pdf","comment":"10 pages main text, 8 appendix"},{"id":"http://arxiv.org/abs/2411.07066v2","updated":"2025-01-09T11:11:37Z","published":"2024-11-11T15:30:16Z","title":"Zeroth-Order Adaptive Neuron Alignment Based Pruning without Re-Training","summary":" Network pruning focuses on computational techniques that aim to reduce a\ngiven model's computational cost by removing a subset of its parameters while\nhaving minimal impact on performance. Throughout the last decade, the most\nwidely used pruning paradigm has been pruning and re-training, which nowadays\nis inconvenient due to the vast amount of pre-trained models, which are in any\ncase too expensive to re-train. In this paper, we exploit functional\ninformation from dense pre-trained models, i.e., their activations, to obtain\nsparse models that maximize the activations' alignment w.r.t. their\ncorresponding dense models. 
Hence, we propose \\textsc{NeuroAL}, a \\emph{top-up}\nalgorithm that can be used on top of any given pruning algorithm for LLMs,\nwhich modifies the block-wise and row-wise sparsity exploiting information from\nboth the dense model and its sparse version to maximize the \\emph{neuron\nalignment} among activations. Differently from existing methods, our approach\nadaptively selects the best hyperparameters for the block-wise and row-wise\nsparsity ratios w.r.t. the model and the desired sparsity, and requires\n\\emph{no re-training}. We test our method over 276 cases combining four LLM\nfamilies, three sparsity ratios, and ten language tasks (three language\nmodeling and seven zero-shot datasets), showing how it consistently outperforms\nthe latest state-of-the-art methods in terms of performance-runtime trade-off.\nThe code is available at\n\\href{https://github.com/eliacunegatti/NeuroAL}{https://github.com/eliacunegatti/NeuroAL}.\n","authors":["Elia Cunegatti","Leonardo Lucio Custode","Giovanni Iacca"],"pdf_url":"https://arxiv.org/pdf/2411.07066v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2501.05147v1","updated":"2025-01-09T10:56:50Z","published":"2025-01-09T10:56:50Z","title":"A Systematic Literature Review on Deep Learning-based Depth Estimation\n in Computer Vision","summary":" Depth estimation (DE) provides spatial information about a scene and enables\ntasks such as 3D reconstruction, object detection, and scene understanding.\nRecently, there has been an increasing interest in using deep learning\n(DL)-based methods for DE. Traditional techniques rely on handcrafted features\nthat often struggle to generalise to diverse scenes and require extensive\nmanual tuning. However, DL models for DE can automatically extract relevant\nfeatures from input data, adapt to various scene conditions, and generalise\nwell to unseen environments. 
Numerous DL-based methods have been developed,\nmaking it necessary to survey and synthesize the state-of-the-art (SOTA).\nPrevious reviews on DE have mainly focused on either monocular or stereo-based\ntechniques, rather than comprehensively reviewing DE. Furthermore, to the best\nof our knowledge, there is no systematic literature review (SLR) that\ncomprehensively focuses on DE. Therefore, this SLR study is being conducted.\nInitially, electronic databases were searched for relevant publications,\nresulting in 1284 publications. Using defined exclusion and quality criteria,\n128 publications were shortlisted and further filtered to select 59\nhigh-quality primary studies. These studies were analysed to extract data and\nanswer defined research questions. Based on the results, DL methods were\ndeveloped for mainly three different types of DE: monocular, stereo, and\nmulti-view. 20 publicly available datasets were used to train, test, and\nevaluate DL models for DE, with KITTI, NYU Depth V2, and Make 3D being the most\nused datasets. 29 evaluation metrics were used to assess the performance of DE.\n35 base models were reported in the primary studies, and the top five most-used\nbase models were ResNet-50, ResNet-18, ResNet-101, U-Net, and VGG-16. Finally,\nthe lack of ground truth data was among the most significant challenges\nreported by primary studies.\n","authors":["Ali Rohan","Md Junayed Hasan","Andrei Petrovski"],"pdf_url":"https://arxiv.org/pdf/2501.05147v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00778v2","updated":"2025-01-09T10:47:35Z","published":"2024-06-02T15:35:45Z","title":"Bayesian Joint Additive Factor Models for Multiview Learning","summary":" It is increasingly common in a wide variety of applied settings to collect\ndata of multiple different types on the same set of samples. Our particular\nfocus in this article is on studying relationships between such multiview\nfeatures and responses. 
A motivating application arises in the context of\nprecision medicine where multi-omics data are collected to correlate with\nclinical outcomes. It is of interest to infer dependence within and across\nviews while combining multimodal information to improve the prediction of\noutcomes. The signal-to-noise ratio can vary substantially across views,\nmotivating more nuanced statistical tools beyond standard late and early\nfusion. This challenge comes with the need to preserve interpretability, select\nfeatures, and obtain accurate uncertainty quantification. We propose a joint\nadditive factor regression model (JAFAR) with a structured additive design,\naccounting for shared and view-specific components. We ensure identifiability\nvia a novel dependent cumulative shrinkage process (D-CUSP) prior. We provide\nan efficient implementation via a partially collapsed Gibbs sampler and extend\nour approach to allow flexible feature and outcome distributions. Prediction of\ntime-to-labor onset from immunome, metabolome, and proteome data illustrates\nperformance gains against state-of-the-art competitors. Our open-source\nsoftware (R package) is available at https://github.com/niccoloanceschi/jafar.\n","authors":["Niccolo Anceschi","Federico Ferrari","David B. Dunson","Himel Mallick"],"pdf_url":"https://arxiv.org/pdf/2406.00778v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05113v1","updated":"2025-01-09T09:59:42Z","published":"2025-01-09T09:59:42Z","title":"Constrained Optimization of Charged Particle Tracking with Multi-Agent\n Reinforcement Learning","summary":" Reinforcement learning demonstrated immense success in modelling complex\nphysics-driven systems, providing end-to-end trainable solutions by interacting\nwith a simulated or real environment, maximizing a scalar reward signal. 
In\nthis work, we propose, building upon previous work, a multi-agent reinforcement\nlearning approach with assignment constraints for reconstructing particle\ntracks in pixelated particle detectors. Our approach optimizes collaboratively\na parametrized policy, functioning as a heuristic to a multidimensional\nassignment problem, by jointly minimizing the total amount of particle\nscattering over the reconstructed tracks in a readout frame. To satisfy\nconstraints, guaranteeing a unique assignment of particle hits, we propose a\nsafety layer solving a linear assignment problem for every joint action.\nFurther, to enforce cost margins, increasing the distance of the local policies\npredictions to the decision boundaries of the optimizer mappings, we recommend\nthe use of an additional component in the blackbox gradient estimation, forcing\nthe policy to solutions with lower total assignment costs. We empirically show\non simulated data, generated for a particle detector developed for proton\nimaging, the effectiveness of our approach, compared to multiple single- and\nmulti-agent baselines. We further demonstrate the effectiveness of constraints\nwith cost margins for both optimization and generalization, introduced by wider\nregions with high reconstruction performance as well as reduced predictive\ninstabilities. Our results form the basis for further developments in RL-based\ntracking, offering both enhanced performance with constrained policies and\ngreater flexibility in optimizing tracking algorithms through the option for\nindividual and team rewards.\n","authors":["Tobias Kortus","Ralf Keidel","Nicolas R. 
Gauger","Jan Kieseler"],"pdf_url":"https://arxiv.org/pdf/2501.05113v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05095v1","updated":"2025-01-09T09:21:09Z","published":"2025-01-09T09:21:09Z","title":"Advancing ALS Applications with Large-Scale Pre-training: Dataset\n Development and Downstream Assessment","summary":" The pre-training and fine-tuning paradigm has revolutionized satellite remote\nsensing applications. However, this approach remains largely underexplored for\nairborne laser scanning (ALS), an important technology for applications such as\nforest management and urban planning. In this study, we address this gap by\nconstructing a large-scale ALS point cloud dataset and evaluating its impact on\ndownstream applications. Our dataset comprises ALS point clouds collected\nacross the contiguous United States, provided by the United States Geological\nSurvey's 3D Elevation Program. To ensure efficient data collection while\ncapturing diverse land cover and terrain types, we introduce a geospatial\nsampling method that selects point cloud tiles based on land cover maps and\ndigital elevation models. As a baseline self-supervised learning model, we\nadopt BEV-MAE, a state-of-the-art masked autoencoder for 3D outdoor point\nclouds, and pre-train it on the constructed dataset. The pre-trained models are\nsubsequently fine-tuned for downstream tasks, including tree species\nclassification, terrain scene recognition, and point cloud semantic\nsegmentation. Our results show that the pre-trained models significantly\noutperform their scratch counterparts across all downstream tasks,\ndemonstrating the transferability of the representations learned from the\nproposed dataset. Furthermore, we observe that scaling the dataset using our\ngeospatial sampling method consistently enhances performance, whereas\npre-training on datasets constructed with random sampling fails to achieve\nsimilar improvements. 
These findings highlight the utility of the constructed\ndataset and the effectiveness of our sampling strategy in the pre-training and\nfine-tuning paradigm. The source code and pre-trained models will be made\npublicly available at \\url{https://github.com/martianxiu/ALS_pretraining}.\n","authors":["Haoyi Xiu","Xin Liu","Taehoon Kim","Kyoung-Sook Kim"],"pdf_url":"https://arxiv.org/pdf/2501.05095v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.06232v2","updated":"2025-01-09T09:20:48Z","published":"2024-10-08T17:41:37Z","title":"Range, not Independence, Drives Modularity in Biological Inspired\n Representation","summary":" Why do biological and artificial neurons sometimes modularise, each encoding\na single meaningful variable, and sometimes entangle their representation of\nmany variables? In this work, we develop a theory of when biologically inspired\nnetworks -- those that are nonnegative and energy efficient -- modularise their\nrepresentation of source variables (sources). We derive necessary and\nsufficient conditions on a sample of sources that determine whether the neurons\nin an optimal biologically-inspired linear autoencoder modularise. Our theory\napplies to any dataset, extending far beyond the case of statistical\nindependence studied in previous work. Rather we show that sources modularise\nif their support is ``sufficiently spread''. From this theory, we extract and\nvalidate predictions in a variety of empirical studies on how data distribution\naffects modularisation in nonlinear feedforward and recurrent neural networks\ntrained on supervised and unsupervised tasks. Furthermore, we apply these ideas\nto neuroscience data, showing that range independence can be used to understand\nthe mixing or modularising of spatial and reward information in entorhinal\nrecordings in seemingly conflicting experiments. 
Further, we use these results\nto suggest alternate origins of mixed-selectivity, beyond the predominant\ntheory of flexible nonlinear classification. In sum, our theory prescribes\nprecise conditions on when neural activities modularise, providing tools for\ninducing and elucidating modular representations in brains and machines.\n","authors":["Will Dorrell","Kyle Hsu","Luke Hollingsworth","Jin Hwa Lee","Jiajun Wu","Chelsea Finn","Peter E Latham","Tim EJ Behrens","James CR Whittington"],"pdf_url":"https://arxiv.org/pdf/2410.06232v2.pdf","comment":"40 pages, 16 figures. WD and KH contributed equally; LH and JHL\n contributed equally"},{"id":"http://arxiv.org/abs/2312.03700v2","updated":"2025-01-09T09:12:06Z","published":"2023-12-06T18:59:19Z","title":"OneLLM: One Framework to Align All Modalities with Language","summary":" Multimodal large language models (MLLMs) have gained significant attention\ndue to their strong multimodal understanding capability. However, existing\nworks rely heavily on modality-specific encoders, which usually differ in\narchitecture and are limited to common modalities. In this paper, we present\nOneLLM, an MLLM that aligns eight modalities to language using a unified\nframework. We achieve this through a unified multimodal encoder and a\nprogressive multimodal alignment pipeline. In detail, we first train an image\nprojection module to connect a vision encoder with LLM. Then, we build a\nuniversal projection module (UPM) by mixing multiple image projection modules\nand dynamic routing. Finally, we progressively align more modalities to LLM\nwith the UPM. To fully leverage the potential of OneLLM in following\ninstructions, we also curated a comprehensive multimodal instruction dataset,\nincluding 2M items from image, audio, video, point cloud, depth/normal map, IMU\nand fMRI brain activity. 
OneLLM is evaluated on 25 diverse benchmarks,\nencompassing tasks such as multimodal captioning, question answering and\nreasoning, where it delivers excellent performance. Code, data, model and\nonline demo are available at https://github.com/csuhan/OneLLM\n","authors":["Jiaming Han","Kaixiong Gong","Yiyuan Zhang","Jiaqi Wang","Kaipeng Zhang","Dahua Lin","Yu Qiao","Peng Gao","Xiangyu Yue"],"pdf_url":"https://arxiv.org/pdf/2312.03700v2.pdf","comment":"Accepted by CVPR 2024. Code: https://github.com/csuhan/OneLLM"},{"id":"http://arxiv.org/abs/2412.10095v2","updated":"2025-01-09T09:09:32Z","published":"2024-12-13T12:31:06Z","title":"HiTZ at VarDial 2025 NorSID: Overcoming Data Scarcity with Language\n Transfer and Automatic Data Annotation","summary":" In this paper we present our submission for the NorSID Shared Task as part of\nthe 2025 VarDial Workshop (Scherrer et al., 2025), consisting of three tasks:\nIntent Detection, Slot Filling and Dialect Identification, evaluated using data\nin different dialects of the Norwegian language. For Intent Detection and Slot\nFilling, we have fine-tuned a multitask model in a cross-lingual setting, to\nleverage the xSID dataset available in 17 languages. In the case of Dialect\nIdentification, our final submission consists of a model fine-tuned on the\nprovided development set, which has obtained the highest scores within our\nexperiments. Our final results on the test set show that our models do not drop\nin performance compared to the development set, likely due to the\ndomain-specificity of the dataset and the similar distribution of both subsets.\nFinally, we also report an in-depth analysis of the provided datasets and their\nartifacts, as well as other sets of experiments that have been carried out but\ndid not yield the best results. 
Additionally, we present an analysis on the\nreasons why some methods have been more successful than others; mainly the\nimpact of the combination of languages and domain-specificity of the training\ndata on the results.\n","authors":["Jaione Bengoetxea","Mikel Zubillaga","Ekhi Azurmendi","Maite Heredia","Julen Etxaniz","Markel Ferro","Jeremy Barnes"],"pdf_url":"https://arxiv.org/pdf/2412.10095v2.pdf","comment":"Vardial 2025 NorSID Shared Task, fixed minor typos"},{"id":"http://arxiv.org/abs/2501.05079v1","updated":"2025-01-09T09:01:04Z","published":"2025-01-09T09:01:04Z","title":"Multimodal-to-Text Prompt Engineering in Large Language Models Using\n Feature Embeddings for GNSS Interference Characterization","summary":" Large language models (LLMs) are advanced AI systems applied across various\ndomains, including NLP, information retrieval, and recommendation systems.\nDespite their adaptability and efficiency, LLMs have not been extensively\nexplored for signal processing tasks, particularly in the domain of global\nnavigation satellite system (GNSS) interference monitoring. GNSS interference\nmonitoring is essential to ensure the reliability of vehicle localization on\nroads, a critical requirement for numerous applications. However, GNSS-based\npositioning is vulnerable to interference from jamming devices, which can\ncompromise its accuracy. The primary objective is to identify, classify, and\nmitigate these interferences. Interpreting GNSS snapshots and the associated\ninterferences presents significant challenges due to the inherent complexity,\nincluding multipath effects, diverse interference types, varying sensor\ncharacteristics, and satellite constellations. In this paper, we extract\nfeatures from a large GNSS dataset and employ LLaVA to retrieve relevant\ninformation from an extensive knowledge base. We employ prompt engineering to\ninterpret the interferences and environmental factors, and utilize t-SNE to\nanalyze the feature embeddings. 
Our findings demonstrate that the proposed\nmethod is capable of visual and logical reasoning within the GNSS context.\nFurthermore, our pipeline outperforms state-of-the-art machine learning models\nin interference classification tasks.\n","authors":["Harshith Manjunath","Lucas Heublein","Tobias Feigl","Felix Ott"],"pdf_url":"https://arxiv.org/pdf/2501.05079v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05078v1","updated":"2025-01-09T09:00:32Z","published":"2025-01-09T09:00:32Z","title":"Analyzing Memorization in Large Language Models through the Lens of\n Model Attribution","summary":" Large Language Models (LLMs) are prevalent in modern applications but often\nmemorize training data, leading to privacy breaches and copyright issues.\nExisting research has mainly focused on posthoc analyses, such as extracting\nmemorized content or developing memorization metrics, without exploring the\nunderlying architectural factors that contribute to memorization. In this work,\nwe investigate memorization from an architectural lens by analyzing how\nattention modules at different layers impact its memorization and\ngeneralization performance. Using attribution techniques, we systematically\nintervene in the LLM architecture by bypassing attention modules at specific\nblocks while keeping other components like layer normalization and MLP\ntransformations intact. We provide theorems analyzing our intervention\nmechanism from a mathematical view, bounding the difference in layer outputs\nwith and without our attributions. Our theoretical and empirical analyses\nreveal that attention modules in deeper transformer blocks are primarily\nresponsible for memorization, whereas earlier blocks are crucial for the models\ngeneralization and reasoning capabilities. We validate our findings through\ncomprehensive experiments on different LLM families (Pythia and GPTNeo) and\nfive benchmark datasets. 
Our insights offer a practical approach to mitigate\nmemorization in LLMs while preserving their performance, contributing to safer\nand more ethical deployment in real world applications.\n","authors":["Tarun Ram Menta","Susmit Agrawal","Chirag Agarwal"],"pdf_url":"https://arxiv.org/pdf/2501.05078v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05075v1","updated":"2025-01-09T08:59:14Z","published":"2025-01-09T08:59:14Z","title":"A Text-Based Knowledge-Embedded Soft Sensing Modeling Approach for\n General Industrial Process Tasks Based on Large Language Model","summary":" Data-driven soft sensors (DDSS) have become mainstream methods for predicting\nkey performance indicators in process industries. However, DDSS development\nrequires complex and costly customized designs tailored to various tasks during\nthe modeling process. Moreover, DDSS are constrained to a single structured\ndata modality, limiting their ability to incorporate additional contextual\nknowledge. Furthermore, DDSSs' limited representation learning leads to weak\npredictive performance with scarce data. To address these challenges, we\npropose a general framework named LLM-TKESS (large language model for\ntext-based knowledge-embedded soft sensing), harnessing the powerful general\nproblem-solving capabilities, cross-modal knowledge transfer abilities, and\nfew-shot capabilities of LLM for enhanced soft sensing modeling. Specifically,\nan auxiliary variable series encoder (AVS Encoder) is proposed to unleash LLM's\npotential for capturing temporal relationships within series and spatial\nsemantic relationships among auxiliary variables. Then, we propose a two-stage\nfine-tuning alignment strategy: in the first stage, employing\nparameter-efficient fine-tuning through autoregressive training adjusts LLM to\nrapidly accommodate process variable data, resulting in a soft sensing\nfoundation model (SSFM). 
Subsequently, by training adapters, we adapt the SSFM\nto various downstream tasks without modifying its architecture. Then, we\npropose two text-based knowledge-embedded soft sensors, integrating new natural\nlanguage modalities to overcome the limitations of pure structured data models.\nFurthermore, benefiting from LLM's pre-existing world knowledge, our model\ndemonstrates outstanding predictive capabilities in small sample conditions.\nUsing the thermal deformation of air preheater rotor as a case study, we\nvalidate through extensive experiments that LLM-TKESS exhibits outstanding\nperformance.\n","authors":["Shuo Tong","Han Liu","Runyuan Guo","Xueqiong Tian","Wenqing Wang","Ding Liu","Youmin Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.05075v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.14503v2","updated":"2025-01-09T08:55:07Z","published":"2024-11-21T08:31:06Z","title":"Planning-Driven Programming: A Large Language Model Programming Workflow","summary":" The strong performance of large language models (LLMs) raises extensive\ndiscussion on their application to code generation. Recent research suggests\ncontinuous program refinements through visible tests to improve code generation\naccuracy in LLMs. However, these methods suffer from LLMs' inefficiency and\nlimited reasoning capacity. In this work, we propose an LLM programming\nworkflow (LPW) designed to improve both initial code generation and subsequent\nrefinements within a structured two-phase workflow. Specifically, the solution\ngeneration phase formulates a solution plan, which is then verified through\nvisible tests to specify the intended natural language solution. Subsequently,\nthe code implementation phase drafts an initial code according to the solution\nplan and its verification. If the generated code fails the visible tests, the\nplan verification serves as the intended solution to consistently inform the\nrefinement process for correcting bugs. 
Compared to state-of-the-art methods\nacross various existing LLMs, LPW significantly improves the Pass@1 accuracy by\nup to 16.4% on well-established text-to-code generation benchmarks. LPW also\nsets new state-of-the-art Pass@1 accuracy, achieving 98.2% on HumanEval, 84.8%\non MBPP, 59.3% on LiveCode, 62.6% on APPS, and 34.7% on CodeContest, using\nGPT-4o as the backbone.\n","authors":["Chao Lei","Yanchuan Chang","Nir Lipovetzky","Krista A. Ehinger"],"pdf_url":"https://arxiv.org/pdf/2411.14503v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05069v1","updated":"2025-01-09T08:44:42Z","published":"2025-01-09T08:44:42Z","title":"Commonsense Video Question Answering through Video-Grounded Entailment\n Tree Reasoning","summary":" This paper proposes the first video-grounded entailment tree reasoning method\nfor commonsense video question answering (VQA). Despite the remarkable progress\nof large visual-language models (VLMs), there are growing concerns that they\nlearn spurious correlations between videos and likely answers, reinforced by\ntheir black-box nature and remaining benchmarking biases. Our method explicitly\ngrounds VQA tasks to video fragments in four steps: entailment tree\nconstruction, video-language entailment verification, tree reasoning, and\ndynamic tree expansion. A vital benefit of the method is its generalizability\nto current video and image-based VLMs across reasoning types. To support fair\nevaluation, we devise a de-biasing procedure based on large-language models\nthat rewrites VQA benchmark answer sets to enforce model reasoning. Systematic\nexperiments on existing and de-biased benchmarks highlight the impact of our\nmethod components across benchmarks, VLMs, and reasoning types.\n","authors":["Huabin Liu","Filip Ilievski","Cees G. M. 
Snoek"],"pdf_url":"https://arxiv.org/pdf/2501.05069v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05068v1","updated":"2025-01-09T08:44:06Z","published":"2025-01-09T08:44:06Z","title":"D3RM: A Discrete Denoising Diffusion Refinement Model for Piano\n Transcription","summary":" Diffusion models have been widely used in the generative domain due to their\nconvincing performance in modeling complex data distributions. Moreover, they\nhave shown competitive results on discriminative tasks, such as image\nsegmentation. While diffusion models have also been explored for automatic\nmusic transcription, their performance has yet to reach a competitive level. In\nthis paper, we focus on discrete diffusion model's refinement capabilities and\npresent a novel architecture for piano transcription. Our model utilizes\nNeighborhood Attention layers as the denoising module, gradually predicting the\ntarget high-resolution piano roll, conditioned on the finetuned features of a\npretrained acoustic model. To further enhance refinement, we devise a novel\nstrategy which applies distinct transition states during training and inference\nstage of discrete diffusion models. Experiments on the MAESTRO dataset show\nthat our approach outperforms previous diffusion-based piano transcription\nmodels and the baseline model in terms of F1 score. Our code is available in\nhttps://github.com/hanshounsu/d3rm.\n","authors":["Hounsu Kim","Taegyun Kwon","Juhan Nam"],"pdf_url":"https://arxiv.org/pdf/2501.05068v1.pdf","comment":"Accepted to ICASSP 2025"},{"id":"http://arxiv.org/abs/2501.05067v1","updated":"2025-01-09T08:43:57Z","published":"2025-01-09T08:43:57Z","title":"LLaVA-Octopus: Unlocking Instruction-Driven Adaptive Projector Fusion\n for Video Understanding","summary":" In this paper, we introduce LLaVA-Octopus, a novel video multimodal large\nlanguage model. 
LLaVA-Octopus adaptively weights features from different visual\nprojectors based on user instructions, enabling us to leverage the\ncomplementary strengths of each projector. We observe that different visual\nprojectors exhibit distinct characteristics when handling specific tasks. For\ninstance, some projectors excel at capturing static details, while others are\nmore effective at processing temporal information, and some are better suited\nfor tasks requiring temporal coherence. By dynamically adjusting feature\nweights according to user instructions, LLaVA-Octopus dynamically selects and\ncombines the most suitable features, significantly enhancing the model's\nperformance in multimodal tasks. Experimental results demonstrate that\nLLaVA-Octopus achieves excellent performance across multiple benchmarks,\nespecially in tasks such as multimodal understanding, visual question\nanswering, and video understanding, highlighting its broad application\npotential.\n","authors":["Jiaxing Zhao","Boyuan Sun","Xiang Chen","Xihan Wei","Qibin Hou"],"pdf_url":"https://arxiv.org/pdf/2501.05067v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05066v1","updated":"2025-01-09T08:43:09Z","published":"2025-01-09T08:43:09Z","title":"Improving Skeleton-based Action Recognition with Interactive Object\n Information","summary":" Human skeleton information is important in skeleton-based action recognition,\nwhich provides a simple and efficient way to describe human pose. However,\nexisting skeleton-based methods focus more on the skeleton, ignoring the\nobjects interacting with humans, resulting in poor performance in recognizing\nactions that involve object interactions. We propose a new action recognition\nframework introducing object nodes to supplement absent interactive object\ninformation. We also propose Spatial Temporal Variable Graph Convolutional\nNetworks (ST-VGCN) to effectively model the Variable Graph (VG) containing\nobject nodes. 
Specifically, in order to validate the role of interactive object\ninformation, by leveraging a simple self-training approach, we establish a new\ndataset, JXGC 24, and an extended dataset, NTU RGB+D+Object 60, including more\nthan 2 million additional object nodes. At the same time, we designe the\nVariable Graph construction method to accommodate a variable number of nodes\nfor graph structure. Additionally, we are the first to explore the overfitting\nissue introduced by incorporating additional object information, and we propose\na VG-based data augmentation method to address this issue, called Random Node\nAttack. Finally, regarding the network structure, we introduce two fusion\nmodules, CAF and WNPool, along with a novel Node Balance Loss, to enhance the\ncomprehensive performance by effectively fusing and balancing skeleton and\nobject node information. Our method surpasses the previous state-of-the-art on\nmultiple skeleton-based action recognition benchmarks. The accuracy of our\nmethod on NTU RGB+D 60 cross-subject split is 96.7\\%, and on cross-view split,\nit is 99.2\\%.\n","authors":["Hao Wen","Ziqian Lu","Fengli Shen","Zhe-Ming Lu","Jialin Cui"],"pdf_url":"https://arxiv.org/pdf/2501.05066v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04614v2","updated":"2025-01-09T08:42:56Z","published":"2025-01-08T16:53:56Z","title":"MedCoDi-M: A Multi-Prompt Foundation Model for Multimodal Medical Data\n Generation","summary":" Artificial Intelligence is revolutionizing medical practice, enhancing\ndiagnostic accuracy and healthcare delivery. However, its adaptation in medical\nsettings still faces significant challenges, related to data availability and\nprivacy constraints. Synthetic data has emerged as a promising solution to\nmitigate these issues, addressing data scarcity while preserving privacy.\nRecently, Latent Diffusion Models have emerged as a powerful tool for\ngenerating high-quality synthetic data. 
Meanwhile, the integration of different\nmodalities has gained interest, emphasizing the need of models capable of\nhandle multimodal medical data. Existing approaches struggle to integrate\ncomplementary information and lack the ability to generate modalities\nsimultaneously. To address this challenge, we present MedCoDi-M, a\n6.77-billion-parameter model, designed for multimodal medical data generation,\nthat, following Foundation Model paradigm, exploits contrastive learning and\nlarge quantity of data to build a shared latent space which capture the\nrelationships between different data modalities. Further, we introduce the\nMulti-Prompt training technique, which significantly boosts MedCoDi-M's\ngeneration under different settings. We extensively validate MedCoDi-M: first\nwe benchmark it against five competitors on the MIMIC-CXR dataset, a\nstate-of-the-art dataset for Chest X-ray and radiological report generation.\nSecondly, we perform a Visual Turing Test with expert radiologists to assess\nthe realism and clinical relevance of the generated data, ensuring alignment\nwith real-world scenarios. Finally, we assess the utility of MedCoDi-M in\naddressing key challenges in the medical field, such as anonymization, data\nscarcity and imbalance learning. The results are promising, demonstrating the\napplicability of MedCoDi-M in medical contexts. 
Project page is at\nhttps://cosbidev.github.io/MedCoDi-M/.\n","authors":["Daniele Molino","Francesco Di Feola","Eliodoro Faiella","Deborah Fazzini","Domiziana Santucci","Linlin Shen","Valerio Guarrasi","Paolo Soda"],"pdf_url":"https://arxiv.org/pdf/2501.04614v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05058v1","updated":"2025-01-09T08:28:31Z","published":"2025-01-09T08:28:31Z","title":"Simultaneous emulation and downscaling with physically-consistent deep\n learning-based regional ocean emulators","summary":" Building on top of the success in AI-based atmospheric emulation, we propose\nan AI-based ocean emulation and downscaling framework focusing on the\nhigh-resolution regional ocean over Gulf of Mexico. Regional ocean emulation\npresents unique challenges owing to the complex bathymetry and lateral boundary\nconditions as well as from fundamental biases in deep learning-based\nframeworks, such as instability and hallucinations. In this paper, we develop a\ndeep learning-based framework to autoregressively integrate ocean-surface\nvariables over the Gulf of Mexico at $8$ Km spatial resolution without\nunphysical drifts over decadal time scales and simulataneously downscale and\nbias-correct it to $4$ Km resolution using a physics-constrained generative\nmodel. 
The framework shows both short-term skills as well as accurate long-term\nstatistics in terms of mean and variability.\n","authors":["Leonard Lupin-Jimenez","Moein Darman","Subhashis Hazarika","Tianning Wu","Michael Gray","Ruyoing He","Anthony Wong","Ashesh Chattopadhyay"],"pdf_url":"https://arxiv.org/pdf/2501.05058v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05053v1","updated":"2025-01-09T08:24:10Z","published":"2025-01-09T08:24:10Z","title":"TAPFed: Threshold Secure Aggregation for Privacy-Preserving Federated\n Learning","summary":" Federated learning is a computing paradigm that enhances privacy by enabling\nmultiple parties to collaboratively train a machine learning model without\nrevealing personal data. However, current research indicates that traditional\nfederated learning platforms are unable to ensure privacy due to privacy leaks\ncaused by the interchange of gradients. To achieve privacy-preserving federated\nlearning, integrating secure aggregation mechanisms is essential.\nUnfortunately, existing solutions are vulnerable to recently demonstrated\ninference attacks such as the disaggregation attack. This paper proposes\nTAPFed, an approach for achieving privacy-preserving federated learning in the\ncontext of multiple decentralized aggregators with malicious actors. TAPFed\nuses a proposed threshold functional encryption scheme and allows for a certain\nnumber of malicious aggregators while maintaining security and privacy. We\nprovide formal security and privacy analyses of TAPFed and compare it to\nvarious baselines through experimental evaluation. Our results show that TAPFed\noffers equivalent performance in terms of model quality compared to\nstate-of-the-art approaches while reducing transmission overhead by 29%-45%\nacross different model training scenarios. 
Most importantly, TAPFed can defend\nagainst recently demonstrated inference attacks caused by curious aggregators,\nwhich the majority of existing approaches are susceptible to.\n","authors":["Runhua Xu","Bo Li","Chao Li","James B. D. Joshi","Shuai Ma","Jianxin Li"],"pdf_url":"https://arxiv.org/pdf/2501.05053v1.pdf","comment":"The paper has been published in IEEE TDSC"},{"id":"http://arxiv.org/abs/2501.05032v1","updated":"2025-01-09T07:44:06Z","published":"2025-01-09T07:44:06Z","title":"Enhancing Human-Like Responses in Large Language Models","summary":" This paper explores the advancements in making large language models (LLMs)\nmore human-like. We focus on techniques that enhance natural language\nunderstanding, conversational coherence, and emotional intelligence in AI\nsystems. The study evaluates various approaches, including fine-tuning with\ndiverse datasets, incorporating psychological principles, and designing models\nthat better mimic human reasoning patterns. Our findings demonstrate that these\nenhancements not only improve user interactions but also open new possibilities\nfor AI applications across different domains. 
Future work will address the\nethical implications and potential biases introduced by these human-like\nattributes.\n","authors":["Ethem Yağız Çalık","Talha Rüzgar Akkuş"],"pdf_url":"https://arxiv.org/pdf/2501.05032v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05030v1","updated":"2025-01-09T07:41:22Z","published":"2025-01-09T07:41:22Z","title":"A General Retrieval-Augmented Generation Framework for Multimodal\n Case-Based Reasoning Applications","summary":" Case-based reasoning (CBR) is an experience-based approach to problem\nsolving, where a repository of solved cases is adapted to solve new cases.\nRecent research shows that Large Language Models (LLMs) with\nRetrieval-Augmented Generation (RAG) can support the Retrieve and Reuse stages\nof the CBR pipeline by retrieving similar cases and using them as additional\ncontext to an LLM query. Most studies have focused on text-only applications,\nhowever, in many real-world problems the components of a case are multimodal.\nIn this paper we present MCBR-RAG, a general RAG framework for multimodal CBR\napplications. The MCBR-RAG framework converts non-text case components into\ntext-based representations, allowing it to: 1) learn application-specific\nlatent representations that can be indexed for retrieval, and 2) enrich the\nquery provided to the LLM by incorporating all case components for better\ncontext. 
We demonstrate MCBR-RAG's effectiveness through experiments conducted\non a simplified Math-24 application and a more complex Backgammon application.\nOur empirical results show that MCBR-RAG improves generation quality compared\nto a baseline LLM with no contextual information provided.\n","authors":["Ofir Marom"],"pdf_url":"https://arxiv.org/pdf/2501.05030v1.pdf","comment":"15 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.06764v3","updated":"2025-01-09T07:39:30Z","published":"2023-08-13T13:01:21Z","title":"Few-shot Class-incremental Learning for Classification and Object\n Detection: A Survey","summary":" Few-shot Class-Incremental Learning (FSCIL) presents a unique challenge in\nMachine Learning (ML), as it necessitates the Incremental Learning (IL) of new\nclasses from sparsely labeled training samples without forgetting previous\nknowledge. While this field has seen recent progress, it remains an active\nexploration area. This paper aims to provide a comprehensive and systematic\nreview of FSCIL. In our in-depth examination, we delve into various facets of\nFSCIL, encompassing the problem definition, the discussion of the primary\nchallenges of unreliable empirical risk minimization and the\nstability-plasticity dilemma, general schemes, and relevant problems of IL and\nFew-shot Learning (FSL). Besides, we offer an overview of benchmark datasets\nand evaluation metrics. Furthermore, we introduce the Few-shot\nClass-incremental Classification (FSCIC) methods from data-based,\nstructure-based, and optimization-based approaches and the Few-shot\nClass-incremental Object Detection (FSCIOD) methods from anchor-free and\nanchor-based approaches. 
Beyond these, we present several promising research\ndirections within FSCIL that merit further investigation.\n","authors":["Jinghua Zhang","Li Liu","Olli Silvén","Matti Pietikäinen","Dewen Hu"],"pdf_url":"https://arxiv.org/pdf/2308.06764v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.01973v3","updated":"2025-01-09T07:26:05Z","published":"2024-12-28T02:28:19Z","title":"INFELM: In-depth Fairness Evaluation of Large Text-To-Image Models","summary":" The rapid development of large language models (LLMs) and large vision models\n(LVMs) have propelled the evolution of multi-modal AI systems, which have\ndemonstrated the remarkable potential for industrial applications by emulating\nhuman-like cognition. However, they also pose significant ethical challenges,\nincluding amplifying harmful content and reinforcing societal biases. For\ninstance, biases in some industrial image generation models highlighted the\nurgent need for robust fairness assessments. Most existing evaluation\nframeworks focus on the comprehensiveness of various aspects of the models, but\nthey exhibit critical limitations, including insufficient attention to content\ngeneration alignment and social bias-sensitive domains. More importantly, their\nreliance on pixel-detection techniques is prone to inaccuracies.\n To address these issues, this paper presents INFELM, an in-depth fairness\nevaluation on widely-used text-to-image models. Our key contributions are: (1)\nan advanced skintone classifier incorporating facial topology and refined skin\npixel representation to enhance classification precision by at least 16.04%,\n(2) a bias-sensitive content alignment measurement for understanding societal\nimpacts, (3) a generalizable representation bias evaluation for diverse\ndemographic groups, and (4) extensive experiments analyzing large-scale\ntext-to-image model outputs across six social-bias-sensitive domains. 
We find\nthat existing models in the study generally do not meet the empirical fairness\ncriteria, and representation bias is generally more pronounced than alignment\nerrors. INFELM establishes a robust benchmark for fairness assessment,\nsupporting the development of multi-modal AI systems that align with ethical\nand human-centric principles.\n","authors":["Di Jin","Xing Liu","Yu Liu","Jia Qing Yap","Andrea Wong","Adriana Crespo","Qi Lin","Zhiyuan Yin","Qiang Yan","Ryan Ye"],"pdf_url":"https://arxiv.org/pdf/2501.01973v3.pdf","comment":"Di Jin and Xing Liu contributed equally to this work"},{"id":"http://arxiv.org/abs/2501.05018v1","updated":"2025-01-09T07:21:44Z","published":"2025-01-09T07:21:44Z","title":"Finding Needles in Emb(a)dding Haystacks: Legal Document Retrieval via\n Bagging and SVR Ensembles","summary":" We introduce a retrieval approach leveraging Support Vector Regression (SVR)\nensembles, bootstrap aggregation (bagging), and embedding spaces on the German\nDataset for Legal Information Retrieval (GerDaLIR). By conceptualizing the\nretrieval task in terms of multiple binary needle-in-a-haystack subtasks, we\nshow improved recall over the baselines (0.849 > 0.803 | 0.829) using our\nvoting ensemble, suggesting promising initial results, without training or\nfine-tuning any deep learning models. 
Our approach holds potential for further\nenhancement, particularly through refining the encoding models and optimizing\nhyperparameters.\n","authors":["Kevin Bönisch","Alexander Mehler"],"pdf_url":"https://arxiv.org/pdf/2501.05018v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.14571v2","updated":"2025-01-09T07:16:39Z","published":"2024-01-26T00:06:08Z","title":"Driving Towards Inclusion: A Systematic Review of AI-powered\n Accessibility Enhancements for People with Disability in Autonomous Vehicles","summary":" This paper provides a comprehensive and, to our knowledge, the first review\nof inclusive human-computer interaction (HCI) within autonomous vehicles (AVs)\nand human-driven cars with partial autonomy, emphasizing accessibility and\nuser-centered design principles. We explore the current technologies and HCI\nsystems designed to enhance passenger experience, particularly for individuals\nwith accessibility needs. Key technologies discussed include brain-computer\ninterfaces, anthropomorphic interaction, virtual reality, augmented reality,\nmode adaptation, voice-activated interfaces, haptic feedback, etc. Each\ntechnology is evaluated for its role in creating an inclusive in-vehicle\nenvironment. Furthermore, we highlight recent interface designs by leading\ncompanies and review emerging concepts and prototypes under development or\ntesting, which show significant potential to address diverse accessibility\nrequirements. Safety considerations, ethical concerns, and adoption of AVs are\nother major issues that require thorough investigation. Building on these\nfindings, we propose an end-to-end design framework that addresses\naccessibility requirements across diverse user demographics, including older\nadults and individuals with physical or cognitive impairments. 
This work\nprovides actionable insights for designers, researchers, and policymakers\naiming to create safer and more comfortable environments in autonomous and\nregular vehicles accessible to all users.\n","authors":["Ashish Bastola","Hao Wang","Sayed Pedram Haeri Boroujeni","Julian Brinkley","Ata Jahangir Moshayedi","Abolfazl Razi"],"pdf_url":"https://arxiv.org/pdf/2401.14571v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05015v1","updated":"2025-01-09T07:16:21Z","published":"2025-01-09T07:16:21Z","title":"On Measuring Unnoticeability of Graph Adversarial Attacks: Observations,\n New Measure, and Applications","summary":" Adversarial attacks are allegedly unnoticeable. Prior studies have designed\nattack noticeability measures on graphs, primarily using statistical tests to\ncompare the topology of original and (possibly) attacked graphs. However, we\nobserve two critical limitations in the existing measures. First, because the\nmeasures rely on simple rules, attackers can readily enhance their attacks to\nbypass them, reducing their attack \"noticeability\" and, yet, maintaining their\nattack performance. Second, because the measures naively leverage global\nstatistics, such as degree distributions, they may entirely overlook attacks\nuntil severe perturbations occur, letting the attacks be almost \"totally\nunnoticeable.\" To address the limitations, we introduce HideNSeek, a learnable\nmeasure for graph attack noticeability. First, to mitigate the bypass problem,\nHideNSeek learns to distinguish the original and (potential) attack edges using\na learnable edge scorer (LEO), which scores each edge on its likelihood of\nbeing an attack. Second, to mitigate the overlooking problem, HideNSeek\nconducts imbalance-aware aggregation of all the edge scores to obtain the final\nnoticeability score. 
Using six real-world graphs, we empirically demonstrate\nthat HideNSeek effectively alleviates the observed limitations, and LEO (i.e.,\nour learnable edge scorer) outperforms eleven competitors in distinguishing\nattack edges under five different attack methods. For an additional\napplication, we show that LEO boosts the performance of robust GNNs by removing\nattack-like edges.\n","authors":["Hyeonsoo Jo","Hyunjin Hwang","Fanchen Bu","Soo Yong Lee","Chanyoung Park","Kijung Shin"],"pdf_url":"https://arxiv.org/pdf/2501.05015v1.pdf","comment":"KDD 2025"},{"id":"http://arxiv.org/abs/2501.05014v1","updated":"2025-01-09T07:15:59Z","published":"2025-01-09T07:15:59Z","title":"UAV-VLA: Vision-Language-Action System for Large Scale Aerial Mission\n Generation","summary":" The UAV-VLA (Visual-Language-Action) system is a tool designed to facilitate\ncommunication with aerial robots. By integrating satellite imagery processing\nwith the Visual Language Model (VLM) and the powerful capabilities of GPT,\nUAV-VLA enables users to generate general flight paths-and-action plans through\nsimple text requests. This system leverages the rich contextual information\nprovided by satellite images, allowing for enhanced decision-making and mission\nplanning. The combination of visual analysis by VLM and natural language\nprocessing by GPT can provide the user with the path-and-action set, making\naerial operations more efficient and accessible. 
The newly developed method\nshowed a 22% difference in the length of the created trajectory and a mean\nerror of 34.22 m in finding the objects of interest on a map, measured by\nEuclidean distance with the K-Nearest Neighbors (KNN) approach.\n","authors":["Oleg Sautenkov","Yasheerah Yaqoot","Artem Lykov","Muhammad Ahsan Mustafa","Grik Tadevosyan","Aibek Akhmetkazy","Miguel Altamirano Cabrera","Mikhail Martynov","Sausar Karaf","Dzmitry Tsetserukou"],"pdf_url":"https://arxiv.org/pdf/2501.05014v1.pdf","comment":"HRI 2025"},{"id":"http://arxiv.org/abs/2501.05007v1","updated":"2025-01-09T07:05:22Z","published":"2025-01-09T07:05:22Z","title":"Quantum-enhanced causal discovery for a small number of samples","summary":" The discovery of causal relationships from observed data has attracted\nsignificant interest from disciplines such as economics, social sciences,\nepidemiology, and biology. In practical applications, considerable knowledge of\nthe underlying systems is often unavailable, and real data are often associated\nwith nonlinear causal structures, which make the direct use of most\nconventional causality analysis methods difficult. This study proposes a novel\nquantum Peter-Clark (qPC) algorithm for causal discovery that does not assume\nany underlying model structures. Based on the independence conditional tests in\na class of reproducing kernel Hilbert spaces characterized by quantum circuits,\nthe proposed qPC algorithm can explore causal relationships from the observed\ndata drawn from arbitrary distributions. We conducted systematic experiments on\nfundamental graph parts of causal structures, demonstrating that the qPC\nalgorithm exhibits a significantly better performance, particularly with\nsmaller sample sizes compared to its classical counterpart. Furthermore, we\nproposed a novel optimization approach based on Kernel Target Alignment (KTA)\nfor determining hyperparameters of quantum kernels. 
This method effectively\nreduced the risk of false positives in causal discovery, enabling more reliable\ninference. Our theoretical and experimental results demonstrate that the\nproposed quantum algorithm can empower classical algorithms for robust and\naccurate inference in causal discovery, supporting them in regimes where\nclassical algorithms typically fail. Additionally, the effectiveness of this\nmethod was validated using the Boston Housing dataset as a real-world\napplication. These findings demonstrate the new potential of quantum\ncircuit-based causal discovery methods in addressing practical challenges,\nparticularly in small-sample scenarios where traditional approaches have shown\nlimitations.\n","authors":["Yota Maeda","Ken Arai","Yu Tanaka","Yu Terada","Hiroshi Ueno","Hiroyuki Tezuka"],"pdf_url":"https://arxiv.org/pdf/2501.05007v1.pdf","comment":"19 pages, 8 figures"},{"id":"http://arxiv.org/abs/2402.07204v5","updated":"2025-01-09T06:53:50Z","published":"2024-02-11T13:30:53Z","title":"ITINERA: Integrating Spatial Optimization with Large Language Models for\n Open-domain Urban Itinerary Planning","summary":" Citywalk, a recently popular form of urban travel, requires genuine\npersonalization and understanding of fine-grained requests compared to\ntraditional itinerary planning. In this paper, we introduce the novel task of\nOpen-domain Urban Itinerary Planning (OUIP), which generates personalized urban\nitineraries from user requests in natural language. We then present ITINERA, an\nOUIP system that integrates spatial optimization with large language models to\nprovide customized urban itineraries based on user needs. This involves\ndecomposing user requests, selecting candidate points of interest (POIs),\nordering the POIs based on cluster-aware spatial optimization, and generating\nthe itinerary. 
Experiments on real-world datasets and the performance of the\ndeployed system demonstrate our system's capacity to deliver personalized and\nspatially coherent itineraries compared to current solutions. Source codes of\nITINERA are available at https://github.com/YihongT/ITINERA.\n","authors":["Yihong Tang","Zhaokai Wang","Ao Qu","Yihao Yan","Zhaofeng Wu","Dingyi Zhuang","Jushi Kai","Kebing Hou","Xiaotong Guo","Han Zheng","Tiange Luo","Jinhua Zhao","Zhan Zhao","Wei Ma"],"pdf_url":"https://arxiv.org/pdf/2402.07204v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10517v5","updated":"2025-01-09T06:41:46Z","published":"2024-08-20T03:35:28Z","title":"Integrating Multi-Modal Input Token Mixer Into Mamba-Based Decision\n Models: Decision MetaMamba","summary":" Sequence modeling with State Space models (SSMs) has demonstrated performance\nsurpassing that of Transformers in various tasks, raising expectations for\ntheir potential to outperform the Decision Transformer and its enhanced\nvariants in offline reinforcement learning (RL). However, decision models based\non Mamba, a state-of-the-art SSM, failed to achieve superior performance\ncompared to these enhanced Decision Transformers. We hypothesize that this\nlimitation arises from information loss during the selective scanning phase. To\naddress this, we propose the Decision MetaMamba (DMM), which augments Mamba\nwith a token mixer in its input layer. This mixer explicitly accounts for the\nmultimodal nature of offline RL inputs, comprising state, action, and\nreturn-to-go. The DMM demonstrates improved performance while significantly\nreducing parameter count compared to prior models. Notably, similar performance\ngains were achieved using a simple linear token mixer, emphasizing the\nimportance of preserving information from proximate time steps rather than the\nspecific design of the token mixer itself. 
This novel modification to Mamba's\ninput layer represents a departure from conventional timestamp-based encoding\napproaches used in Transformers. By enhancing performance of Mamba in offline\nRL, characterized by memory efficiency and fast inference, this work opens new\navenues for its broader application in future RL research.\n","authors":["Wall Kim"],"pdf_url":"https://arxiv.org/pdf/2408.10517v5.pdf","comment":"We have decided to withdraw this manuscript as we believe that the\n work requires significant improvements and further research to ensure its\n quality and impact. We are currently pursuing a more comprehensive approach\n to address the limitations of the current submission and plan to resubmit an\n improved version in the future"},{"id":"http://arxiv.org/abs/2408.16030v2","updated":"2025-01-09T06:33:24Z","published":"2024-08-28T09:30:20Z","title":"Deep Learning-Based Automatic Multi-Level Airway Collapse Monitoring on\n Obstructive Sleep Apnea Patients","summary":" This study investigated the use of deep learning to identify multi-level\nupper airway collapses in obstructive sleep apnea (OSA) patients based on\nsnoring sounds. We fine-tuned ResNet-50 and Audio Spectrogram Transformer\n(AST) models using snoring recordings from 37 subjects undergoing drug-induced\nsleep endoscopy (DISE) between 2020 and 2021. Snoring sounds were labeled\naccording to the VOTE (Velum, Oropharynx, Tongue Base, Epiglottis)\nclassification, resulting in 259 V, 403 O, 77 T, 13 E, 1016 VO, 46 VT, 140 OT,\n39 OE, 30 VOT, and 3150 non-snoring (N) 0.5-second clips. The models were\ntrained for two multi-label classification tasks: identifying obstructions at\nV, O, T, and E levels, and identifying retropalatal (RP) and retroglossal (RG)\nobstructions. 
Results showed AST slightly outperformed ResNet-50,\ndemonstrating good ability to identify V (F1-score: 0.71, MCC: 0.61, AUC:\n0.89), O (F1-score: 0.80, MCC: 0.72, AUC: 0.94), and RP obstructions (F1-score:\n0.86, MCC: 0.77, AUC: 0.97). However, both models struggled with T, E, and RG\nclassifications due to limited data. Retrospective analysis of a full-night\nrecording showed the potential to profile airway obstruction dynamics. We\nexpect this information, combined with polysomnography and other clinical\nparameters, can aid clinical triage and treatment planning for OSA patients.\n","authors":["Ying-Chieh Hsu","Stanley Yung-Chuan Liu","Chao-Jung Huang","Chi-Wei Wu","Ren-Kai Cheng","Jane Yung-Jen Hsu","Shang-Ran Huang","Yuan-Ren Cheng","Fu-Shun Hsu"],"pdf_url":"https://arxiv.org/pdf/2408.16030v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04997v1","updated":"2025-01-09T06:26:28Z","published":"2025-01-09T06:26:28Z","title":"GiNet: Integrating Sequential and Context-Aware Learning for Battery\n Capacity Prediction","summary":" The surging demand for batteries requires advanced battery management\nsystems, where battery capacity modelling is a key functionality. In this\npaper, we aim to achieve accurate battery capacity prediction by learning from\nhistorical measurements of battery dynamics. We propose GiNet, a gated\nrecurrent units enhanced Informer network, for predicting battery's capacity.\nThe novelty and competitiveness of GiNet lies in its capability of capturing\nsequential and contextual information from raw battery data and reflecting the\nbattery's complex behaviors with both temporal dynamics and long-term\ndependencies. We conducted an experimental study based on a publicly available\ndataset to showcase GiNet's strength of gaining a holistic understanding of\nbattery behavior and predicting battery capacity accurately. 
GiNet achieves\n0.11 mean absolute error for predicting the battery capacity in a sequence of\nfuture time slots without knowing the historical battery capacity. It also\noutperforms the latest algorithms significantly with 27% error reduction on\naverage compared to Informer. The promising results highlight the importance of\ncustomized and optimized integration of algorithm and battery knowledge and\nshed light on other industry applications as well.\n","authors":["Sara Sameer","Wei Zhang","Xin Lou","Qingyu Yan","Terence Goh","Yulin Gao"],"pdf_url":"https://arxiv.org/pdf/2501.04997v1.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2501.04995v1","updated":"2025-01-09T06:20:00Z","published":"2025-01-09T06:20:00Z","title":"IPDN: Image-enhanced Prompt Decoding Network for 3D Referring Expression\n Segmentation","summary":" 3D Referring Expression Segmentation (3D-RES) aims to segment point cloud\nscenes based on a given expression. However, existing 3D-RES approaches face\ntwo major challenges: feature ambiguity and intent ambiguity. Feature ambiguity\narises from information loss or distortion during point cloud acquisition due\nto limitations such as lighting and viewpoint. Intent ambiguity refers to the\nmodel's equal treatment of all queries during the decoding process, lacking\ntop-down task-specific guidance. In this paper, we introduce an Image enhanced\nPrompt Decoding Network (IPDN), which leverages multi-view images and\ntask-driven information to enhance the model's reasoning capabilities. To\naddress feature ambiguity, we propose the Multi-view Semantic Embedding (MSE)\nmodule, which injects multi-view 2D image information into the 3D scene and\ncompensates for potential spatial information loss. To tackle intent ambiguity,\nwe designed a Prompt-Aware Decoder (PAD) that guides the decoding process by\nderiving task-driven signals from the interaction between the expression and\nvisual features. 
Comprehensive experiments demonstrate that IPDN outperforms\nthe state-of-the-art by 1.9 and 4.2 points in mIoU metrics on the 3D-RES and\n3D-GRES tasks, respectively.\n","authors":["Qi Chen","Changli Wu","Jiayi Ji","Yiwei Ma","Danni Yang","Xiaoshuai Sun"],"pdf_url":"https://arxiv.org/pdf/2501.04995v1.pdf","comment":"AAAI 2025"},{"id":"http://arxiv.org/abs/2410.14368v2","updated":"2025-01-09T06:02:11Z","published":"2024-10-18T10:53:44Z","title":"CoMAL: Collaborative Multi-Agent Large Language Models for\n Mixed-Autonomy Traffic","summary":" The integration of autonomous vehicles into urban traffic has great potential\nto improve efficiency by reducing congestion and optimizing traffic flow\nsystematically. In this paper, we introduce CoMAL (Collaborative Multi-Agent\nLLMs), a framework designed to address the mixed-autonomy traffic problem by\ncollaboration among autonomous vehicles to optimize traffic flow. CoMAL is\nbuilt upon large language models, operating in an interactive traffic\nsimulation environment. It utilizes a Perception Module to observe surrounding\nagents and a Memory Module to store strategies for each agent. The overall\nworkflow includes a Collaboration Module that encourages autonomous vehicles to\ndiscuss the effective strategy and allocate roles, a reasoning engine to\ndetermine optimal behaviors based on assigned roles, and an Execution Module\nthat controls vehicle actions using a hybrid approach combining rule-based\nmodels. Experimental results demonstrate that CoMAL achieves superior\nperformance on the Flow benchmark. Additionally, we evaluate the impact of\ndifferent language models and compare our framework with reinforcement learning\napproaches. It highlights the strong cooperative capability of LLM agents and\npresents a promising solution to the mixed-autonomy traffic challenge. 
The code\nis available at https://github.com/Hyan-Yao/CoMAL.\n","authors":["Huaiyuan Yao","Longchao Da","Vishnu Nandam","Justin Turnau","Zhiwei Liu","Linsey Pang","Hua Wei"],"pdf_url":"https://arxiv.org/pdf/2410.14368v2.pdf","comment":"8 pages, 4 figures, accepted to SDM25"},{"id":"http://arxiv.org/abs/2501.04982v1","updated":"2025-01-09T05:45:03Z","published":"2025-01-09T05:45:03Z","title":"CuRLA: Curriculum Learning Based Deep Reinforcement Learning for\n Autonomous Driving","summary":" In autonomous driving, traditional Computer Vision (CV) agents often struggle\nin unfamiliar situations due to biases in the training data. Deep Reinforcement\nLearning (DRL) agents address this by learning from experience and maximizing\nrewards, which helps them adapt to dynamic environments. However, ensuring\ntheir generalization remains challenging, especially with static training\nenvironments. Additionally, DRL models lack transparency, making it difficult\nto guarantee safety in all scenarios, particularly those not seen during\ntraining. To tackle these issues, we propose a method that combines DRL with\nCurriculum Learning for autonomous driving. Our approach uses a Proximal Policy\nOptimization (PPO) agent and a Variational Autoencoder (VAE) to learn safe\ndriving in the CARLA simulator. The agent is trained using two-fold curriculum\nlearning, progressively increasing environment difficulty and incorporating a\ncollision penalty in the reward function to promote safety. This method\nimproves the agent's adaptability and reliability in complex environments, and\nhelps it understand the nuances of balancing multiple reward components from\ndifferent feedback signals in a single scalar reward function. 
Keywords: Computer Vision,\nDeep Reinforcement Learning, Variational Autoencoder, Proximal Policy\nOptimization, Curriculum Learning, Autonomous Driving.\n","authors":["Bhargava Uppuluri","Anjel Patel","Neil Mehta","Sridhar Kamath","Pratyush Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2501.04982v1.pdf","comment":"To be published in the 17th International Conference on Agents and\n Artificial Intelligence (ICAART), Feb 2025"},{"id":"http://arxiv.org/abs/2501.04974v1","updated":"2025-01-09T05:06:44Z","published":"2025-01-09T05:06:44Z","title":"SensorQA: A Question Answering Benchmark for Daily-Life Monitoring","summary":" With the rapid growth in sensor data, effectively interpreting and\ninterfacing with these data in a human-understandable way has become crucial.\nWhile existing research primarily focuses on learning classification models,\nfewer studies have explored how end users can actively extract useful insights\nfrom sensor data, often hindered by the lack of a proper dataset. To address\nthis gap, we introduce SensorQA, the first human-created question-answering\n(QA) dataset for long-term time-series sensor data for daily life monitoring.\nSensorQA is created by human workers and includes 5.6K diverse and practical\nqueries that reflect genuine human interests, paired with accurate answers\nderived from sensor data. We further establish benchmarks for state-of-the-art\nAI models on this dataset and evaluate their performance on typical edge\ndevices. Our results reveal a gap between current models and optimal QA\nperformance and efficiency, highlighting the need for new contributions. 
The\ndataset and code are available at:\nhttps://github.com/benjamin-reichman/SensorQA.\n","authors":["Benjamin Reichman","Xiaofan Yu","Lanxiang Hu","Jack Truxal","Atishay Jain","Rushil Chandrupatla","Tajana Šimunić Rosing","Larry Heck"],"pdf_url":"https://arxiv.org/pdf/2501.04974v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04970v1","updated":"2025-01-09T04:59:15Z","published":"2025-01-09T04:59:15Z","title":"Battling the Non-stationarity in Time Series Forecasting via Test-time\n Adaptation","summary":" Deep Neural Networks have spearheaded remarkable advancements in time series\nforecasting (TSF), one of the major tasks in time series modeling. Nonetheless,\nthe non-stationarity of time series undermines the reliability of pre-trained\nsource time series forecasters in mission-critical deployment settings. In this\nstudy, we introduce a pioneering test-time adaptation framework tailored for\nTSF (TSF-TTA). TAFAS, the proposed approach to TSF-TTA, flexibly adapts source\nforecasters to continuously shifting test distributions while preserving the\ncore semantic information learned during pre-training. The novel utilization of\npartially-observed ground truth and gated calibration module enables proactive,\nrobust, and model-agnostic adaptation of source forecasters. Experiments on\ndiverse benchmark datasets and cutting-edge architectures demonstrate the\nefficacy and generality of TAFAS, especially in long-term forecasting scenarios\nthat suffer from significant distribution shifts. 
The code is available at\nhttps://github.com/kimanki/TAFAS.\n","authors":["HyunGi Kim","Siwon Kim","Jisoo Mok","Sungroh Yoon"],"pdf_url":"https://arxiv.org/pdf/2501.04970v1.pdf","comment":"Accepted at AAAI 2025"},{"id":"http://arxiv.org/abs/2501.04961v1","updated":"2025-01-09T04:26:15Z","published":"2025-01-09T04:26:15Z","title":"Demystifying Domain-adaptive Post-training for Financial LLMs","summary":" Domain-adaptive post-training of large language models (LLMs) has emerged as\na promising approach for specialized domains such as medicine and finance.\nHowever, significant challenges remain in identifying optimal adaptation\ncriteria and training strategies across varying data and model configurations.\nTo address these challenges, we introduce FINDAP, a systematic and fine-grained\ninvestigation into domain-adaptive post-training of LLMs for the finance\ndomain. Our approach begins by identifying the core capabilities required for\nthe target domain and designing a comprehensive evaluation suite aligned with\nthese needs. We then analyze the effectiveness of key post-training stages,\nincluding continual pretraining, instruction tuning, and preference alignment.\nBuilding on these insights, we propose an effective training recipe centered on\na novel preference data distillation method, which leverages process signals\nfrom a generative reward model. The resulting model, Llama-Fin, achieves\nstate-of-the-art performance across a wide range of financial tasks. Our\nanalysis also highlights how each post-training stage contributes to distinct\ncapabilities, uncovering specific challenges and effective solutions, providing\nvaluable insights for domain adaptation of LLMs. 
Project page:\nhttps://github.com/SalesforceAIResearch/FinDap\n","authors":["Zixuan Ke","Yifei Ming","Xuan-Phi Nguyen","Caiming Xiong","Shafiq Joty"],"pdf_url":"https://arxiv.org/pdf/2501.04961v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03847v2","updated":"2025-01-09T04:25:42Z","published":"2025-01-07T15:01:58Z","title":"Diffusion as Shader: 3D-aware Video Diffusion for Versatile Video\n Generation Control","summary":" Diffusion models have demonstrated impressive performance in generating\nhigh-quality videos from text prompts or images. However, precise control over\nthe video generation process, such as camera manipulation or content editing,\nremains a significant challenge. Existing methods for controlled video\ngeneration are typically limited to a single control type, lacking the\nflexibility to handle diverse control demands. In this paper, we introduce\nDiffusion as Shader (DaS), a novel approach that supports multiple video\ncontrol tasks within a unified architecture. Our key insight is that achieving\nversatile video control necessitates leveraging 3D control signals, as videos\nare fundamentally 2D renderings of dynamic 3D content. Unlike prior methods\nlimited to 2D control signals, DaS leverages 3D tracking videos as control\ninputs, making the video diffusion process inherently 3D-aware. This innovation\nallows DaS to achieve a wide range of video controls by simply manipulating the\n3D tracking videos. A further advantage of using 3D tracking videos is their\nability to effectively link frames, significantly enhancing the temporal\nconsistency of the generated videos. 
With just 3 days of fine-tuning on 8 H800\nGPUs using less than 10k videos, DaS demonstrates strong control capabilities\nacross diverse tasks, including mesh-to-video generation, camera control,\nmotion transfer, and object manipulation.\n","authors":["Zekai Gu","Rui Yan","Jiahao Lu","Peng Li","Zhiyang Dou","Chenyang Si","Zhen Dong","Qifeng Liu","Cheng Lin","Ziwei Liu","Wenping Wang","Yuan Liu"],"pdf_url":"https://arxiv.org/pdf/2501.03847v2.pdf","comment":"Project page: https://igl-hkust.github.io/das/ Codes:\n https://github.com/IGL-HKUST/DiffusionAsShader"},{"id":"http://arxiv.org/abs/2501.04958v1","updated":"2025-01-09T04:20:12Z","published":"2025-01-09T04:20:12Z","title":"Addressing Domain Shift via Imbalance-Aware Domain Adaptation in Embryo\n Development Assessment","summary":" Deep learning models in medical imaging face dual challenges: domain shift,\nwhere models perform poorly when deployed in settings different from their\ntraining environment, and class imbalance, where certain disease conditions are\nnaturally underrepresented. We present Imbalance-Aware Domain Adaptation\n(IADA), a novel framework that simultaneously tackles both challenges through\nthree key components: (1) adaptive feature learning with class-specific\nattention mechanisms, (2) balanced domain alignment with dynamic weighting, and\n(3) adaptive threshold optimization. Our theoretical analysis establishes\nconvergence guarantees and complexity bounds. Through extensive experiments on\nembryo development assessment across four imaging modalities, IADA demonstrates\nsignificant improvements over existing methods, achieving up to 25.19% higher\naccuracy while maintaining balanced performance across classes. In challenging\nscenarios with low-quality imaging systems, IADA shows robust generalization\nwith AUC improvements of up to 12.56%. These results demonstrate IADA's\npotential for developing reliable and equitable medical imaging systems for\ndiverse clinical settings. 
The code is made publicly available at\nhttps://github.com/yinghemedical/imbalance-aware_domain_adaptation\n","authors":["Lei Li","Xinglin Zhang","Jun Liang","Tao Chen"],"pdf_url":"https://arxiv.org/pdf/2501.04958v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2501.04945v1","updated":"2025-01-09T03:34:07Z","published":"2025-01-09T03:34:07Z","title":"Step-by-Step Mastery: Enhancing Soft Constraint Following Ability of\n Large Language Models","summary":" It is crucial for large language models (LLMs) to follow instructions that\ninvolve multiple constraints. However, soft constraints are semantically\nrelated and difficult to verify through automated methods. These constraints\nremain a significant challenge for LLMs. To enhance the ability of LLMs to\nfollow soft constraints, we initially design a pipeline to obtain high-quality\noutputs automatically. Additionally, to fully utilize the acquired data, we\nintroduce a training paradigm based on curriculum learning. We experimentally\nevaluate the effectiveness of our methods in improving LLMs' soft constraint\nfollowing ability and analyze the factors driving the improvements. The\ndatasets and code are publicly available at\nhttps://github.com/Rainier-rq/FollowSoftConstraints.\n","authors":["Qingyu Ren","Jie Zeng","Qianyu He","Jiaqing Liang","Yanghua Xiao","Weikang Zhou","Zeye Sun","Fei Yu"],"pdf_url":"https://arxiv.org/pdf/2501.04945v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.15594v3","updated":"2025-01-09T03:08:17Z","published":"2024-11-23T16:03:35Z","title":"A Survey on LLM-as-a-Judge","summary":" Accurate and consistent evaluation is crucial for decision-making across\nnumerous fields, yet it remains a challenging task due to inherent\nsubjectivity, variability, and scale. Large Language Models (LLMs) have\nachieved remarkable success across diverse domains, leading to the emergence of\n\"LLM-as-a-Judge,\" where LLMs are employed as evaluators for complex tasks. 
With\ntheir ability to process diverse data types and provide scalable,\ncost-effective, and consistent assessments, LLMs present a compelling\nalternative to traditional expert-driven evaluations. However, ensuring the\nreliability of LLM-as-a-Judge systems remains a significant challenge that\nrequires careful design and standardization. This paper provides a\ncomprehensive survey of LLM-as-a-Judge, addressing the core question: How can\nreliable LLM-as-a-Judge systems be built? We explore strategies to enhance\nreliability, including improving consistency, mitigating biases, and adapting\nto diverse assessment scenarios. Additionally, we propose methodologies for\nevaluating the reliability of LLM-as-a-Judge systems, supported by a novel\nbenchmark designed for this purpose. To advance the development and real-world\ndeployment of LLM-as-a-Judge systems, we also discuss practical applications,\nchallenges, and future directions. This survey serves as a foundational\nreference for researchers and practitioners in this rapidly evolving field.\n","authors":["Jiawei Gu","Xuhui Jiang","Zhichao Shi","Hexiang Tan","Xuehao Zhai","Chengjin Xu","Wei Li","Yinghan Shen","Shengjie Ma","Honghao Liu","Yuanzhuo Wang","Jian Guo"],"pdf_url":"https://arxiv.org/pdf/2411.15594v3.pdf","comment":"Corrected typos & more discussion on reasoning models 33 pages, 9\n figures. arXiv admin note: text overlap with arXiv:2310.05470 by other\n authors"},{"id":"http://arxiv.org/abs/2501.04931v1","updated":"2025-01-09T02:47:01Z","published":"2025-01-09T02:47:01Z","title":"Jailbreaking Multimodal Large Language Models via Shuffle Inconsistency","summary":" Multimodal Large Language Models (MLLMs) have achieved impressive performance\nand have been put into practical use in commercial applications, but they still\nhave potential safety mechanism vulnerabilities. Jailbreak attacks are red\nteaming methods that aim to bypass safety mechanisms and discover MLLMs'\npotential risks. 
Existing MLLMs' jailbreak methods often bypass the model's\nsafety mechanism through complex optimization methods or carefully designed\nimage and text prompts. Despite achieving some progress, they have a low attack\nsuccess rate on commercial closed-source MLLMs. Unlike previous research, we\nempirically find that there exists a Shuffle Inconsistency between MLLMs'\ncomprehension ability and safety ability for the shuffled harmful instruction.\nThat is, from the perspective of comprehension ability, MLLMs can understand\nthe shuffled harmful text-image instructions well. However, they can be easily\nbypassed by the shuffled harmful instructions from the perspective of safety\nability, leading to harmful responses. Then we innovatively propose a\ntext-image jailbreak attack named SI-Attack. Specifically, to fully utilize the\nShuffle Inconsistency and overcome the shuffle randomness, we apply a\nquery-based black-box optimization method to select the most harmful shuffled\ninputs based on the feedback of the toxic judge model. A series of experiments\nshow that SI-Attack can improve the attack's performance on three benchmarks.\nIn particular, SI-Attack can obviously improve the attack success rate for\ncommercial MLLMs such as GPT-4o or Claude-3.5-Sonnet.\n","authors":["Shiji Zhao","Ranjie Duan","Fengxiang Wang","Chi Chen","Caixin Kang","Jialing Tao","YueFeng Chen","Hui Xue","Xingxing Wei"],"pdf_url":"https://arxiv.org/pdf/2501.04931v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04928v1","updated":"2025-01-09T02:36:21Z","published":"2025-01-09T02:36:21Z","title":"Image2CADSeq: Computer-Aided Design Sequence and Knowledge Inference\n from Product Images","summary":" Computer-aided design (CAD) tools empower designers to design and modify 3D\nmodels through a series of CAD operations, commonly referred to as a CAD\nsequence. In scenarios where digital CAD files are not accessible, reverse\nengineering (RE) has been used to reconstruct 3D CAD models. 
Recent advances\nhave seen the rise of data-driven approaches for RE, with a primary focus on\nconverting 3D data, such as point clouds, into 3D models in boundary\nrepresentation (B-rep) format. However, obtaining 3D data poses significant\nchallenges, and B-rep models do not reveal knowledge about the 3D modeling\nprocess of designs. To this end, our research introduces a novel data-driven\napproach with an Image2CADSeq neural network model. This model aims to reverse\nengineer CAD models by processing images as input and generating CAD sequences.\nThese sequences can then be translated into B-rep models using a solid modeling\nkernel. Unlike B-rep models, CAD sequences offer enhanced flexibility to modify\nindividual steps of model creation, providing a deeper understanding of the\nconstruction process of CAD models. To quantitatively and rigorously evaluate\nthe predictive performance of the Image2CADSeq model, we have developed a\nmulti-level evaluation framework for model assessment. The model was trained on\na specially synthesized dataset, and various network architectures were\nexplored to optimize the performance. The experimental and validation results\nshow great potential for the model in generating CAD sequences from 2D image\ndata.\n","authors":["Xingang Li","Zhenghui Sha"],"pdf_url":"https://arxiv.org/pdf/2501.04928v1.pdf","comment":"20 pages, 10 figures, and 6 tables"},{"id":"http://arxiv.org/abs/2404.06429v3","updated":"2025-01-09T02:34:25Z","published":"2024-04-09T16:20:03Z","title":"Magic-Boost: Boost 3D Generation with Multi-View Conditioned Diffusion","summary":" Benefiting from the rapid development of 2D diffusion models, 3D content\ngeneration has witnessed significant progress. One promising solution is to\nfinetune the pre-trained 2D diffusion models to produce multi-view images and\nthen reconstruct them into 3D assets via feed-forward sparse-view\nreconstruction models. 
However, limited by the 3D inconsistency in the\ngenerated multi-view images and the low reconstruction resolution of the\nfeed-forward reconstruction models, the generated 3d assets are still limited\nto incorrect geometries and blurry textures. To address this problem, we\npresent a multi-view based refine method, named Magic-Boost, to further refine\nthe generation results. In detail, we first propose a novel multi-view\nconditioned diffusion model which extracts 3d prior from the synthesized\nmulti-view images to synthesize high-fidelity novel view images and then\nintroduce a novel iterative-update strategy to adopt it to provide precise\nguidance to refine the coarse generated results through a fast optimization\nprocess. Conditioned on the strong 3d priors extracted from the synthesized\nmulti-view images, Magic-Boost is capable of providing precise optimization\nguidance that well aligns with the coarse generated 3D assets, enriching the\nlocal detail in both geometry and texture within a short time ($\\sim15$min).\nExtensive experiments show Magic-Boost greatly enhances the coarse generated\ninputs, generates high-quality 3D assets with rich geometric and textural\ndetails. (Project Page: https://magic-research.github.io/magic-boost/)\n","authors":["Fan Yang","Jianfeng Zhang","Yichun Shi","Bowen Chen","Chenxu Zhang","Huichao Zhang","Xiaofeng Yang","Xiu Li","Jiashi Feng","Guosheng Lin"],"pdf_url":"https://arxiv.org/pdf/2404.06429v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.17052v2","updated":"2025-01-09T02:33:14Z","published":"2024-12-22T15:05:30Z","title":"ViLBias: A Comprehensive Framework for Bias Detection through Linguistic\n and Visual Cues , presenting Annotation Strategies, Evaluation, and Key\n Challenges","summary":" The integration of Large Language Models (LLMs) and Vision-Language Models\n(VLMs) opens new avenues for addressing complex challenges in multimodal\ncontent analysis, particularly in biased news detection. 
This study introduces\nVLBias, a framework that leverages state-of-the-art LLMs and VLMs to detect\nlinguistic and visual biases in news content. We present a multimodal dataset\ncomprising textual content and corresponding images from diverse news sources.\nWe propose a hybrid annotation framework that combines LLM-based annotations\nwith human review to ensure high-quality labeling while reducing costs and\nenhancing scalability. Our evaluation compares the performance of\nstate-of-the-art SLMs and LLMs for both modalities (text and images) and the\nresults reveal that while SLMs are computationally efficient, LLMs demonstrate\nsuperior accuracy in identifying subtle framing and text-visual\ninconsistencies. Furthermore, empirical analysis shows that incorporating\nvisual cues alongside textual data improves bias detection accuracy by 3 to 5%.\nThis study provides a comprehensive exploration of LLMs, SLMs, and VLMs as\ntools for detecting multimodal biases in news content and highlights their\nrespective strengths, limitations, and potential for future applications\n","authors":["Shaina Raza","Caesar Saleh","Emrul Hasan","Franklin Ogidi","Maximus Powers","Veronica Chatrath","Marcelo Lotif","Roya Javadi","Anam Zahid","Vahid Reza Khazaie"],"pdf_url":"https://arxiv.org/pdf/2412.17052v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2501.04926v1","updated":"2025-01-09T02:30:26Z","published":"2025-01-09T02:30:26Z","title":"FLowHigh: Towards Efficient and High-Quality Audio Super-Resolution with\n Single-Step Flow Matching","summary":" Audio super-resolution is challenging owing to its ill-posed nature.\nRecently, the application of diffusion models in audio super-resolution has\nshown promising results in alleviating this challenge. However, diffusion-based\nmodels have limitations, primarily the necessity for numerous sampling steps,\nwhich causes significantly increased latency when synthesizing high-quality\naudio samples. 
In this paper, we propose FLowHigh, a novel approach that\nintegrates flow matching, a highly efficient generative model, into audio\nsuper-resolution. We also explore probability paths specially tailored for\naudio super-resolution, which effectively capture high-resolution audio\ndistributions, thereby enhancing reconstruction quality. The proposed method\ngenerates high-fidelity, high-resolution audio through a single-step sampling\nprocess across various input sampling rates. The experimental results on the\nVCTK benchmark dataset demonstrate that FLowHigh achieves state-of-the-art\nperformance in audio super-resolution, as evaluated by log-spectral distance\nand ViSQOL while maintaining computational efficiency with only a single-step\nsampling process.\n","authors":["Jun-Hak Yun","Seung-Bin Kim","Seong-Whan Lee"],"pdf_url":"https://arxiv.org/pdf/2501.04926v1.pdf","comment":"Accepted by ICASSP 2025"},{"id":"http://arxiv.org/abs/2501.04070v2","updated":"2025-01-09T02:20:13Z","published":"2025-01-07T14:57:08Z","title":"More is not always better? Enhancing Many-Shot In-Context Learning with\n Differentiated and Reweighting Objectives","summary":" Large language models (LLMs) excel at few-shot in-context learning (ICL)\nwithout requiring parameter updates. However, as the number of ICL\ndemonstrations increases from a few to many, performance tends to plateau and\neventually decline. We identify two primary causes for this trend: the\nsuboptimal negative log-likelihood (NLL) optimization objective and the\nincremental data noise. To address these issues, we introduce DrICL, a novel\noptimization method that enhances model performance through Differentiated\nLearning and advantage-based Reweighting objectives. Globally, DrICL utilizes\ndifferentiated learning to optimize the NLL objective, ensuring that many-shot\nperformance surpasses zero-shot levels. 
Locally, it dynamically adjusts the\nweighting of many-shot demonstrations by leveraging cumulative advantages\ninspired by reinforcement learning, thereby improving generalization. This\napproach allows the model to handle varying numbers of shots effectively,\nmitigating the impact of noisy data. Recognizing the lack of multi-task\ndatasets with diverse many-shot distributions, we develop the Many-Shot ICL\nBenchmark (ICL-50)-a large-scale benchmark of 50 tasks that cover shot numbers\nfrom 1 to 350 within sequences of up to 8,000 tokens-for fine-tuning purposes.\nICL-50 facilitates the evaluation of many-shot ICL strategies across seven\nprominent NLP tasks and 50 distinct datasets. Experimental results demonstrate\nthat LLMs enhanced with DrICL achieve significant improvements in many-shot\nsetups across various tasks, including both in-domain and out-of-domain\nscenarios. We release the code and benchmark dataset hoping to facilitate\nfurther research in many-shot ICL.\n","authors":["Xiaoqing Zhang","Ang Lv","Yuhan Liu","Flood Sung","Wei Liu","Shuo Shang","Xiuying Chen","Rui Yan"],"pdf_url":"https://arxiv.org/pdf/2501.04070v2.pdf","comment":"13 pages, 8 figures, 11 tables"},{"id":"http://arxiv.org/abs/2501.04228v2","updated":"2025-01-09T01:35:56Z","published":"2025-01-08T01:59:47Z","title":"Constraints as Rewards: Reinforcement Learning for Robots without Reward\n Functions","summary":" Reinforcement learning has become an essential algorithm for generating\ncomplex robotic behaviors. However, to learn such behaviors, it is necessary to\ndesign a reward function that describes the task, which often consists of\nmultiple objectives that needs to be balanced. This tuning process is known as\nreward engineering and typically involves extensive trial-and-error. In this\npaper, to avoid this trial-and-error process, we propose the concept of\nConstraints as Rewards (CaR). 
CaR formulates the task objective using multiple\nconstraint functions instead of a reward function and solves a reinforcement\nlearning problem with constraints using the Lagrangian-method. By adopting this\napproach, different objectives are automatically balanced, because Lagrange\nmultipliers serves as the weights among the objectives. In addition, we will\ndemonstrate that constraints, expressed as inequalities, provide an intuitive\ninterpretation of the optimization target designed for the task. We apply the\nproposed method to the standing-up motion generation task of a\nsix-wheeled-telescopic-legged robot and demonstrate that the proposed method\nsuccessfully acquires the target behavior, even though it is challenging to\nlearn with manually designed reward functions.\n","authors":["Yu Ishihara","Noriaki Takasugi","Kotaro Kawakami","Masaya Kinoshita","Kazumi Aoyama"],"pdf_url":"https://arxiv.org/pdf/2501.04228v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.17077v4","updated":"2025-01-09T01:29:00Z","published":"2023-06-29T16:28:34Z","title":"RAPGen: An Approach for Fixing Code Inefficiencies in Zero-Shot","summary":" Performance bugs are non-functional bugs that can even manifest in\nwell-tested commercial products. Fixing these performance bugs is an important\nyet challenging problem. In this work, we address this challenge and present a\nnew approach called Retrieval-Augmented Prompt Generation (RAPGen). Given a\ncode snippet with a performance issue, RAPGen first retrieves a prompt\ninstruction from a pre-constructed knowledge-base of previous performance bug\nfixes and then generates a prompt using the retrieved instruction. It then uses\nthis prompt on a Large Language Model (such as Codex) in zero-shot to generate\na fix. We compare our approach with the various prompt variations and state of\nthe art methods in the task of performance bug fixing. 
Our evaluation shows\nthat RAPGen can generate performance improvement suggestions equivalent or\nbetter than a developer in ~60% of the cases, getting ~42% of them verbatim, in\nan expert-verified dataset of past performance changes made by C# developers.\n","authors":["Spandan Garg","Roshanak Zilouchian Moghaddam","Neel Sundaresan"],"pdf_url":"https://arxiv.org/pdf/2306.17077v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04899v1","updated":"2025-01-09T01:24:59Z","published":"2025-01-09T01:24:59Z","title":"SUGAR: Leveraging Contextual Confidence for Smarter Retrieval","summary":" Bearing in mind the limited parametric knowledge of Large Language Models\n(LLMs), retrieval-augmented generation (RAG) which supplies them with the\nrelevant external knowledge has served as an approach to mitigate the issue of\nhallucinations to a certain extent. However, uniformly retrieving supporting\ncontext makes response generation source-inefficient, as triggering the\nretriever is not always necessary, or even inaccurate, when a model gets\ndistracted by noisy retrieved content and produces an unhelpful answer.\nMotivated by these issues, we introduce Semantic Uncertainty Guided Adaptive\nRetrieval (SUGAR), where we leverage context-based entropy to actively decide\nwhether to retrieve and to further determine between single-step and multi-step\nretrieval. 
Our empirical results show that selective retrieval guided by\nsemantic uncertainty estimation improves the performance across diverse\nquestion answering tasks, as well as achieves a more efficient inference.\n","authors":["Hanna Zubkova","Ji-Hoon Park","Seong-Whan Lee"],"pdf_url":"https://arxiv.org/pdf/2501.04899v1.pdf","comment":"ICASSP2025"},{"id":"http://arxiv.org/abs/2501.04896v1","updated":"2025-01-09T00:50:44Z","published":"2025-01-09T00:50:44Z","title":"Quantifying Itch and its Impact on Sleep Using Machine Learning and\n Radio Signals","summary":" Chronic itch affects 13% of the US population, is highly debilitating, and\nunderlies many medical conditions. A major challenge in clinical care and new\ntherapeutics development is the lack of an objective measure for quantifying\nitch, leading to reliance on subjective measures like patients' self-assessment\nof itch severity. In this paper, we show that a home radio device paired with\nartificial intelligence (AI) can concurrently capture scratching and evaluate\nits impact on sleep quality by analyzing radio signals bouncing in the\nenvironment. The device eliminates the need for wearable sensors or skin\ncontact, enabling monitoring of chronic itch over extended periods at home\nwithout burdening patients or interfering with their skin condition. To\nvalidate the technology, we conducted an observational clinical study of\nchronic pruritus patients, monitored at home for one month using both the radio\ndevice and an infrared camera. Comparing the output of the device to ground\ntruth data from the camera demonstrates its feasibility and accuracy (ROC AUC =\n0.997, sensitivity = 0.825, specificity = 0.997). The results reveal a\nsignificant correlation between scratching and low sleep quality, manifested as\na reduction in sleep efficiency (R = 0.6, p < 0.001) and an increase in sleep\nlatency (R = 0.68, p < 0.001). 
Our study underscores the potential of passive,\nlong-term, at-home monitoring of chronic scratching and its sleep implications,\noffering a valuable tool for both clinical care of chronic itch patients and\npharmaceutical clinical trials.\n","authors":["Michail Ouroutzoglou","Mingmin Zhao","Joshua Hellerstein","Hariharan Rahul","Asima Badic","Brian S. Kim","Dina Katabi"],"pdf_url":"https://arxiv.org/pdf/2501.04896v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.05315v2","updated":"2025-01-09T00:11:59Z","published":"2024-10-05T03:37:07Z","title":"PalmBench: A Comprehensive Benchmark of Compressed Large Language Models\n on Mobile Platforms","summary":" Deploying large language models (LLMs) locally on mobile devices is\nadvantageous in scenarios where transmitting data to remote cloud servers is\neither undesirable due to privacy concerns or impractical due to network\nconnection. Recent advancements (MLC, 2023a; Gerganov, 2023) have facilitated\nthe local deployment of LLMs. However, local deployment also presents\nchallenges, particularly in balancing quality (generative performance),\nlatency, and throughput within the hardware constraints of mobile devices. In\nthis paper, we introduce our lightweight, all-in-one automated benchmarking\nframework that allows users to evaluate LLMs on mobile devices. We provide a\ncomprehensive benchmark of various popular LLMs with different quantization\nconfigurations (both weights and activations) across multiple mobile platforms\nwith varying hardware capabilities. Unlike traditional benchmarks that assess\nfull-scale models on high-end GPU clusters, we focus on evaluating resource\nefficiency (memory and power consumption) and harmful output for compressed\nmodels on mobile devices. 
Our key observations include i) differences in energy\nefficiency and throughput across mobile platforms; ii) the impact of\nquantization on memory usage, GPU execution time, and power consumption; and\niii) accuracy and performance degradation of quantized models compared to their\nnon-quantized counterparts; and iv) the frequency of hallucinations and toxic\ncontent generated by compressed LLMs on mobile devices.\n","authors":["Yilong Li","Jingyu Liu","Hao Zhang","M Badri Narayanan","Utkarsh Sharma","Shuai Zhang","Pan Hu","Yijing Zeng","Jayaram Raghuram","Suman Banerjee"],"pdf_url":"https://arxiv.org/pdf/2410.05315v2.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2501.05614v1","updated":"2025-01-09T23:25:06Z","published":"2025-01-09T23:25:06Z","title":"Watermarking Graph Neural Networks via Explanations for Ownership\n Protection","summary":" Graph Neural Networks (GNNs) are the mainstream method to learn pervasive\ngraph data and are widely deployed in industry, making their intellectual\nproperty valuable. However, protecting GNNs from unauthorized use remains a\nchallenge. Watermarking, which embeds ownership information into a model, is a\npotential solution. However, existing watermarking methods have two key\nlimitations: First, almost all of them focus on non-graph data, with\nwatermarking GNNs for complex graph data largely unexplored. Second, the de\nfacto backdoor-based watermarking methods pollute training data and induce\nownership ambiguity through intentional misclassification. Our\nexplanation-based watermarking inherits the strengths of backdoor-based methods\n(e.g., robust to watermark removal attacks), but avoids data pollution and\neliminates intentional misclassification. In particular, our method learns to\nembed the watermark in GNN explanations such that this unique watermark is\nstatistically distinct from other potential solutions, and ownership claims\nmust show statistical significance to be verified. 
We theoretically prove that,\neven with full knowledge of our method, locating the watermark is an NP-hard\nproblem. Empirically, our method manifests robustness to removal attacks like\nfine-tuning and pruning. By addressing these challenges, our approach marks a\nsignificant advancement in protecting GNN intellectual property.\n","authors":["Jane Downer","Ren Wang","Binghui Wang"],"pdf_url":"https://arxiv.org/pdf/2501.05614v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.10425v3","updated":"2025-01-09T22:46:26Z","published":"2024-12-10T16:34:47Z","title":"Active Inference for Self-Organizing Multi-LLM Systems: A Bayesian\n Thermodynamic Approach to Adaptation","summary":" This paper introduces a novel approach to creating adaptive language agents\nby integrating active inference with large language models (LLMs). While LLMs\ndemonstrate remarkable capabilities, their reliance on static prompts limits\nadaptation to new information and changing environments. We address this by\nimplementing an active inference framework that acts as a cognitive layer above\nan LLM-based agent, dynamically adjusting prompts and search strategies through\nprincipled information-seeking behavior. Our framework models the environment\nusing three state factors (prompt, search, and information states) with seven\nobservation modalities capturing quality metrics. By framing the agent's\nlearning through the free energy principle, we enable systematic exploration of\nprompt combinations and search strategies. Experimental results demonstrate the\neffectiveness of this approach, with the agent developing accurate models of\nenvironment dynamics evidenced by emergent structure in observation matrices.\nAction selection patterns reveal sophisticated exploration-exploitation\nbehavior, transitioning from initial information-gathering to targeted prompt\ntesting. 
The integration of thermodynamic principles with language model\ncapabilities provides a principled framework for creating robust, adaptable\nagents, extending active inference beyond traditional low-dimensional control\nproblems to high-dimensional, language-driven environments.\n","authors":["Rithvik Prakki"],"pdf_url":"https://arxiv.org/pdf/2412.10425v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09889v3","updated":"2025-01-09T22:43:05Z","published":"2024-04-15T15:55:01Z","title":"Is Table Retrieval a Solved Problem? Exploring Join-Aware Multi-Table\n Retrieval","summary":" Retrieving relevant tables containing the necessary information to accurately\nanswer a given question over tables is critical to open-domain\nquestion-answering (QA) systems. Previous methods assume the answer to such a\nquestion can be found either in a single table or multiple tables identified\nthrough question decomposition or rewriting. However, neither of these\napproaches is sufficient, as many questions require retrieving multiple tables\nand joining them through a join plan that cannot be discerned from the user\nquery itself. If the join plan is not considered in the retrieval stage, the\nsubsequent steps of reasoning and answering based on those retrieved tables are\nlikely to be incorrect. To address this problem, we introduce a method that\nuncovers useful join relations for any query and database during table\nretrieval. We use a novel re-ranking method formulated as a mixed-integer\nprogram that considers not only table-query relevance but also table-table\nrelevance that requires inferring join relationships. Our method outperforms\nthe state-of-the-art approaches for table retrieval by up to 9.3% in F1 score\nand for end-to-end QA by up to 5.4% in accuracy.\n","authors":["Peter Baile Chen","Yi Zhang","Dan Roth"],"pdf_url":"https://arxiv.org/pdf/2404.09889v3.pdf","comment":"ACL 2024. 
Dataset and code are available at\n https://peterbaile.github.io/jar"},{"id":"http://arxiv.org/abs/2501.05605v1","updated":"2025-01-09T22:41:50Z","published":"2025-01-09T22:41:50Z","title":"Advancing Personalized Learning Analysis via an Innovative Domain\n Knowledge Informed Attention-based Knowledge Tracing Method","summary":" Emerging Knowledge Tracing (KT) models, particularly deep learning and\nattention-based Knowledge Tracing, have shown great potential in realizing\npersonalized learning analysis via prediction of students' future performance\nbased on their past interactions. The existing methods mainly focus on\nimmediate past interactions or individual concepts without accounting for\ndependencies between knowledge concept, referred as knowledge concept routes,\nthat can be critical to advance the understanding the students' learning\noutcomes. To address this, in this paper, we propose an innovative\nattention-based method by effectively incorporating the domain knowledge of\nknowledge concept routes in the given curriculum. Additionally, we leverage\nXES3G5M dataset, a benchmark dataset with rich auxiliary information for\nknowledge concept routes, to evaluate and compare the performance of our\nproposed method to the seven State-of-the-art (SOTA) deep learning models.\n","authors":["Shubham Kose","Jin Wei-Kocsis"],"pdf_url":"https://arxiv.org/pdf/2501.05605v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17428v2","updated":"2025-01-09T22:27:06Z","published":"2024-05-27T17:59:45Z","title":"NV-Embed: Improved Techniques for Training LLMs as Generalist Embedding\n Models","summary":" Decoder-only large language model (LLM)-based embedding models are beginning\nto outperform BERT or T5-based embedding models in general-purpose text\nembedding tasks, including dense vector-based retrieval. 
In this work, we\nintroduce the NV-Embed model, incorporating architectural designs, training\nprocedures, and curated datasets to significantly enhance the performance of\nLLM as a versatile embedding model, while maintaining its simplicity and\nreproducibility. For model architecture, we propose a latent attention layer to\nobtain pooled embeddings, which consistently improves retrieval and downstream\ntask accuracy compared to mean pooling or using the last token embedding\nfrom LLMs. To enhance representation learning, we remove the causal attention\nmask of LLMs during contrastive training. For training algorithm, we introduce\na two-stage contrastive instruction-tuning method. It first applies contrastive\ntraining with instructions on retrieval datasets, utilizing in-batch negatives\nand curated hard negative examples. At stage-2, it blends various non-retrieval\ninto instruction tuning, which not only enhances non-retrieval task accuracy\nbut also improves retrieval performance. For training data, we utilize the\nhard-negative mining, synthetic data generation and existing public available\ndatasets to boost the performance of embedding model. By combining these\ntechniques, our NV-Embed-v1 and NV-Embed-v2 models obtained the No.1 position\non the Massive Text Embedding Benchmark (MTEB) (as of May 24, 2024 and August\n30, 2024, respectively) across 56 embedding tasks, demonstrating the sustained\neffectiveness of the proposed methods over time. 
Additionally, it achieved the\nhighest scores in the Long Doc section and the second-highest scores in the QA\nsection of the AIR Benchmark, which covers a range of out-of-domain information\nretrieval topics beyond those in MTEB.\n","authors":["Chankyu Lee","Rajarshi Roy","Mengyao Xu","Jonathan Raiman","Mohammad Shoeybi","Bryan Catanzaro","Wei Ping"],"pdf_url":"https://arxiv.org/pdf/2405.17428v2.pdf","comment":"We open-source the model at:\n https://huggingface.co/nvidia/NV-Embed-v2"},{"id":"http://arxiv.org/abs/2303.17155v4","updated":"2025-01-09T22:23:15Z","published":"2023-03-30T05:25:20Z","title":"Discriminative Class Tokens for Text-to-Image Diffusion Models","summary":" Recent advances in text-to-image diffusion models have enabled the generation\nof diverse and high-quality images. While impressive, the images often fall\nshort of depicting subtle details and are susceptible to errors due to\nambiguity in the input text. One way of alleviating these issues is to train\ndiffusion models on class-labeled datasets. This approach has two\ndisadvantages: (i) supervised datasets are generally small compared to\nlarge-scale scraped text-image datasets on which text-to-image models are\ntrained, affecting the quality and diversity of the generated images, or (ii)\nthe input is a hard-coded label, as opposed to free-form text, limiting the\ncontrol over the generated images.\n In this work, we propose a non-invasive fine-tuning technique that\ncapitalizes on the expressive potential of free-form text while achieving high\naccuracy through discriminative signals from a pretrained classifier. This is\ndone by iteratively modifying the embedding of an added input token of a\ntext-to-image diffusion model, by steering generated images toward a given\ntarget class according to a classifier. Our method is fast compared to prior\nfine-tuning methods and does not require a collection of in-class images or\nretraining of a noise-tolerant classifier. 
We evaluate our method extensively,\nshowing that the generated images are: (i) more accurate and of higher quality\nthan standard diffusion models, (ii) can be used to augment training data in a\nlow-resource setting, and (iii) reveal information about the data used to train\nthe guiding classifier. The code is available at\n\\url{https://github.com/idansc/discriminative_class_tokens}.\n","authors":["Idan Schwartz","Vésteinn Snæbjarnarson","Hila Chefer","Ryan Cotterell","Serge Belongie","Lior Wolf","Sagie Benaim"],"pdf_url":"https://arxiv.org/pdf/2303.17155v4.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2403.13257v3","updated":"2025-01-09T22:21:56Z","published":"2024-03-20T02:38:01Z","title":"Arcee's MergeKit: A Toolkit for Merging Large Language Models","summary":" The rapid expansion of the open-source language model landscape presents an\nopportunity to merge the competencies of these model checkpoints by combining\ntheir parameters. Advances in transfer learning, the process of fine-tuning\npretrained models for specific tasks, has resulted in the development of vast\namounts of task-specific models, typically specialized in individual tasks and\nunable to utilize each other's strengths. Model merging facilitates the\ncreation of multitask models without the need for additional training, offering\na promising avenue for enhancing model performance and versatility. By\npreserving the intrinsic capabilities of the original models, model merging\naddresses complex challenges in AI - including the difficulties of catastrophic\nforgetting and multitask learning. To support this expanding area of research,\nwe introduce MergeKit, a comprehensive, open-source library designed to\nfacilitate the application of model merging strategies. MergeKit offers an\nextensible framework to efficiently merge models on any hardware, providing\nutility to researchers and practitioners. 
To date, thousands of models have\nbeen merged by the open-source community, leading to the creation of some of\nthe worlds most powerful open-source model checkpoints, as assessed by the Open\nLLM Leaderboard. The library is accessible at\nhttps://github.com/arcee-ai/MergeKit.\n","authors":["Charles Goddard","Shamane Siriwardhana","Malikeh Ehghaghi","Luke Meyers","Vlad Karpukhin","Brian Benedict","Mark McQuade","Jacob Solawetz"],"pdf_url":"https://arxiv.org/pdf/2403.13257v3.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2408.06687v2","updated":"2025-01-09T22:14:55Z","published":"2024-08-13T07:27:02Z","title":"Masked Image Modeling: A Survey","summary":" In this work, we survey recent studies on masked image modeling (MIM), an\napproach that emerged as a powerful self-supervised learning technique in\ncomputer vision. The MIM task involves masking some information, e.g.~pixels,\npatches, or even latent representations, and training a model, usually an\nautoencoder, to predicting the missing information by using the context\navailable in the visible part of the input. We identify and formalize two\ncategories of approaches on how to implement MIM as a pretext task, one based\non reconstruction and one based on contrastive learning. Then, we construct a\ntaxonomy and review the most prominent papers in recent years. We complement\nthe manually constructed taxonomy with a dendrogram obtained by applying a\nhierarchical clustering algorithm. We further identify relevant clusters via\nmanually inspecting the resulting dendrogram. Our review also includes datasets\nthat are commonly used in MIM research. We aggregate the performance results of\nvarious masked image modeling methods on the most popular datasets, to\nfacilitate the comparison of competing methods. Finally, we identify research\ngaps and propose several interesting directions of future work. 
We supplement\nour survey with the following public repository containing organized\nreferences: https://github.com/vladhondru25/MIM-Survey.\n","authors":["Vlad Hondru","Florinel Alin Croitoru","Shervin Minaee","Radu Tudor Ionescu","Nicu Sebe"],"pdf_url":"https://arxiv.org/pdf/2408.06687v2.pdf","comment":"Revised version"},{"id":"http://arxiv.org/abs/2404.18731v3","updated":"2025-01-09T22:10:14Z","published":"2024-04-29T14:17:52Z","title":"Real Time Multi Organ Classification on Computed Tomography Images","summary":" Organ segmentation is a fundamental task in medical imaging since it is\nuseful for many clinical automation pipelines. However, some tasks do not\nrequire full segmentation. Instead, a classifier can identify the selected\norgan without segmenting the entire volume. In this study, we demonstrate a\nclassifier based method to obtain organ labels in real time by using a large\ncontext size with a sparse data sampling strategy. Although our method operates\nas an independent classifier at query locations, it can generate full\nsegmentations by querying grid locations at any resolution, offering faster\nperformance than segmentation algorithms. We compared our method with existing\nsegmentation techniques, demonstrating its superior runtime potential for\npractical applications in medical imaging.\n","authors":["Halid Ziya Yerebakan","Yoshihisa Shinagawa","Gerardo Hermosillo Valadez"],"pdf_url":"https://arxiv.org/pdf/2404.18731v3.pdf","comment":"11 pages, Organ Classification, Organ Segmentation"},{"id":"http://arxiv.org/abs/2411.08745v3","updated":"2025-01-09T21:53:56Z","published":"2024-11-13T16:26:19Z","title":"Separating Tongue from Thought: Activation Patching Reveals\n Language-Agnostic Concept Representations in Transformers","summary":" A central question in multilingual language modeling is whether large\nlanguage models (LLMs) develop a universal concept representation, disentangled\nfrom specific languages. 
In this paper, we address this question by analyzing\nlatent representations (latents) during a word translation task in\ntransformer-based LLMs. We strategically extract latents from a source\ntranslation prompt and insert them into the forward pass on a target\ntranslation prompt. By doing so, we find that the output language is encoded in\nthe latent at an earlier layer than the concept to be translated. Building on\nthis insight, we conduct two key experiments. First, we demonstrate that we can\nchange the concept without changing the language and vice versa through\nactivation patching alone. Second, we show that patching with the mean over\nlatents across different languages does not impair and instead improves the\nmodels' performance in translating the concept. Our results provide evidence\nfor the existence of language-agnostic concept representations within the\ninvestigated models.\n","authors":["Clément Dumas","Chris Wendler","Veniamin Veselovsky","Giovanni Monea","Robert West"],"pdf_url":"https://arxiv.org/pdf/2411.08745v3.pdf","comment":"18 pages, 14 figures, previous version published under the title \"How\n Do Llamas Process Multilingual Text? A Latent Exploration through Activation\n Patching\" at the ICML 2024 mechanistic interpretability workshop at\n https://openreview.net/forum?id=0ku2hIm4BS"},{"id":"http://arxiv.org/abs/2501.05567v1","updated":"2025-01-09T20:34:36Z","published":"2025-01-09T20:34:36Z","title":"Approximate Supervised Object Distance Estimation on Unmanned Surface\n Vehicles","summary":" Unmanned surface vehicles (USVs) and boats are increasingly important in\nmaritime operations, yet their deployment is limited due to costly sensors and\ncomplexity. LiDAR, radar, and depth cameras are either costly, yield sparse\npoint clouds or are noisy, and require extensive calibration. Here, we\nintroduce a novel approach for approximate distance estimation in USVs using\nsupervised object detection. 
We collected a dataset comprising images with\nmanually annotated bounding boxes and corresponding distance measurements.\nLeveraging this data, we propose a specialized branch of an object detection\nmodel, not only to detect objects but also to predict their distances from the\nUSV. This method offers a cost-efficient and intuitive alternative to\nconventional distance measurement techniques, aligning more closely with human\nestimation capabilities. We demonstrate its application in a marine assistance\nsystem that alerts operators to nearby objects such as boats, buoys, or other\nwaterborne hazards.\n","authors":["Benjamin Kiefer","Yitong Quan","Andreas Zell"],"pdf_url":"https://arxiv.org/pdf/2501.05567v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05566v1","updated":"2025-01-09T20:29:31Z","published":"2025-01-09T20:29:31Z","title":"Vision-Language Models for Autonomous Driving: CLIP-Based Dynamic Scene\n Understanding","summary":" Scene understanding is essential for enhancing driver safety, generating\nhuman-centric explanations for Automated Vehicle (AV) decisions, and leveraging\nArtificial Intelligence (AI) for retrospective driving video analysis. This\nstudy developed a dynamic scene retrieval system using Contrastive\nLanguage-Image Pretraining (CLIP) models, which can be optimized for real-time\ndeployment on edge devices. The proposed system outperforms state-of-the-art\nin-context learning methods, including the zero-shot capabilities of GPT-4o,\nparticularly in complex scenarios. By conducting frame-level analysis on the\nHonda Scenes Dataset, which contains a collection of about 80 hours of\nannotated driving videos capturing diverse real-world road and weather\nconditions, our study highlights the robustness of CLIP models in learning\nvisual concepts from natural language supervision. 
Results also showed that\nfine-tuning the CLIP models, such as ViT-L/14 and ViT-B/32, significantly\nimproved scene classification, achieving a top F1 score of 91.1%. These results\ndemonstrate the ability of the system to deliver rapid and precise scene\nrecognition, which can be used to meet the critical requirements of Advanced\nDriver Assistance Systems (ADAS). This study shows the potential of CLIP models\nto provide scalable and efficient frameworks for dynamic scene understanding\nand classification. Furthermore, this work lays the groundwork for advanced\nautonomous vehicle technologies by fostering a deeper understanding of driver\nbehavior, road conditions, and safety-critical scenarios, marking a significant\nstep toward smarter, safer, and more context-aware autonomous driving systems.\n","authors":["Mohammed Elhenawy","Huthaifa I. Ashqar","Andry Rakotonirainy","Taqwa I. Alhadidi","Ahmed Jaber","Mohammad Abu Tami"],"pdf_url":"https://arxiv.org/pdf/2501.05566v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.09566v3","updated":"2025-01-09T20:24:46Z","published":"2024-09-15T00:53:44Z","title":"Learning Transferable Features for Implicit Neural Representations","summary":" Implicit neural representations (INRs) have demonstrated success in a variety\nof applications, including inverse problems and neural rendering. An INR is\ntypically trained to capture one signal of interest, resulting in learned\nneural features that are highly attuned to that signal. Assumed to be less\ngeneralizable, we explore the aspect of transferability of such learned neural\nfeatures for fitting similar signals. We introduce a new INR training\nframework, STRAINER that learns transferrable features for fitting INRs to new\nsignals from a given distribution, faster and with better reconstruction\nquality. 
Owing to the sequential layer-wise affine operations in an INR, we\npropose to learn transferable representations by sharing initial encoder layers\nacross multiple INRs with independent decoder layers. At test time, the learned\nencoder representations are transferred as initialization for an otherwise\nrandomly initialized INR. We find STRAINER to yield extremely powerful\ninitialization for fitting images from the same domain and allow for $\\approx\n+10dB$ gain in signal quality early on compared to an untrained INR itself.\nSTRAINER also provides a simple way to encode data-driven priors in INRs. We\nevaluate STRAINER on multiple in-domain and out-of-domain signal fitting tasks\nand inverse problems and further provide detailed analysis and discussion on\nthe transferability of STRAINER's features. Our demo can be accessed at\nhttps://kushalvyas.github.io/strainer.html .\n","authors":["Kushal Vyas","Ahmed Imtiaz Humayun","Aniket Dashpute","Richard G. Baraniuk","Ashok Veeraraghavan","Guha Balakrishnan"],"pdf_url":"https://arxiv.org/pdf/2409.09566v3.pdf","comment":"Project Website: https://kushalvyas.github.io/strainer.html"},{"id":"http://arxiv.org/abs/2412.14194v3","updated":"2025-01-09T20:16:41Z","published":"2024-12-12T23:42:46Z","title":"Detecting Cognitive Impairment and Psychological Well-being among Older\n Adults Using Facial, Acoustic, Linguistic, and Cardiovascular Patterns\n Derived from Remote Conversations","summary":" The aging society urgently requires scalable methods to monitor cognitive\ndecline and identify social and psychological factors indicative of dementia\nrisk in older adults. Our machine learning (ML) models captured facial,\nacoustic, linguistic, and cardiovascular features from 39 individuals with\nnormal cognition or Mild Cognitive Impairment derived from remote video\nconversations and classified cognitive status, social isolation, neuroticism,\nand psychological well-being. 
Our model could distinguish Clinical Dementia\nRating Scale (CDR) of 0.5 (vs. 0) with 0.78 area under the receiver operating\ncharacteristic curve (AUC), social isolation with 0.75 AUC, neuroticism with\n0.71 AUC, and negative affect scales with 0.79 AUC. Recent advances in machine\nlearning offer new opportunities to remotely detect cognitive impairment and\nassess associated factors, such as neuroticism and psychological well-being.\nOur experiment showed that speech and language patterns were more useful for\nquantifying cognitive impairment, whereas facial expression and cardiovascular\npatterns using photoplethysmography (PPG) were more useful for quantifying\npersonality and psychological well-being.\n","authors":["Xiaofan Mu","Salman Seyedi","Iris Zheng","Zifan Jiang","Liu Chen","Bolaji Omofojoye","Rachel Hershenberg","Allan I. Levey","Gari D. Clifford","Hiroko H. Dodge","Hyeokhyen Kwon"],"pdf_url":"https://arxiv.org/pdf/2412.14194v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05559v1","updated":"2025-01-09T20:11:08Z","published":"2025-01-09T20:11:08Z","title":"Soup to go: mitigating forgetting during continual learning with model\n averaging","summary":" In continual learning, where task data arrives in a sequence, fine-tuning on\nlater tasks will often lead to performance degradation on earlier tasks. This\nis especially pronounced when these tasks come from diverse domains. In this\nsetting, how can we mitigate catastrophic forgetting of earlier tasks and\nretain what the model has learned with minimal computational expenses? Inspired\nby other merging methods, and L2-regression, we propose Sequential Fine-tuning\nwith Averaging (SFA), a method that merges currently training models with\nearlier checkpoints during the course of training. 
SOTA approaches typically\nmaintain a data buffer of past tasks or impose a penalty at each gradient step.\nIn contrast, our method achieves comparable results without the need to store\npast data, or multiple copies of parameters for each gradient step.\nFurthermore, our method outperforms common merging techniques such as Task\nArithmetic, TIES Merging, and WiSE-FT, as well as other penalty methods like L2\nand Elastic Weight Consolidation. In turn, our method offers insight into the\nbenefits of merging partially-trained models during training across both image\nand language domains.\n","authors":["Anat Kleiman","Gintare Karolina Dziugaite","Jonathan Frankle","Sham Kakade","Mansheej Paul"],"pdf_url":"https://arxiv.org/pdf/2501.05559v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13969v2","updated":"2025-01-09T20:08:31Z","published":"2023-08-26T22:48:06Z","title":"Gaze-Informed Vision Transformers: Predicting Driving Decisions Under\n Uncertainty","summary":" Vision Transformers (ViT) have advanced computer vision, yet their efficacy\nin complex tasks like driving remains less explored. This study enhances ViT by\nintegrating human eye gaze, captured via eye-tracking, to increase prediction\naccuracy in driving scenarios under uncertainty in both real-world and virtual\nreality scenarios. First, we establish the significance of human eye gaze in\nleft-right driving decisions, as observed in both human subjects and a ViT\nmodel. By comparing the similarity between human fixation maps and ViT\nattention weights, we reveal the dynamics of overlap across individual heads\nand layers. This overlap demonstrates that fixation data can guide the model in\ndistributing its attention weights more effectively. We introduce the\nfixation-attention intersection (FAX) loss, a novel loss function that\nsignificantly improves ViT performance under high uncertainty conditions. 
Our\nresults show that ViT, when trained with FAX loss, aligns its attention with\nhuman gaze patterns. This gaze-informed approach has significant potential for\ndriver behavior analysis, as well as broader applications in human-centered AI\nsystems, extending ViT's use to complex visual environments.\n","authors":["Sharath Koorathota","Nikolas Papadopoulos","Jia Li Ma","Shruti Kumar","Xiaoxiao Sun","Arunesh Mittal","Patrick Adelman","Paul Sajda"],"pdf_url":"https://arxiv.org/pdf/2308.13969v2.pdf","comment":"25 pages, 9 figures, 3 tables"},{"id":"http://arxiv.org/abs/2501.05555v1","updated":"2025-01-09T20:02:10Z","published":"2025-01-09T20:02:10Z","title":"Improving Zero-Shot Object-Level Change Detection by Incorporating\n Visual Correspondence","summary":" Detecting object-level changes between two images across possibly different\nviews is a core task in many applications that involve visual inspection or\ncamera surveillance. Existing change-detection approaches suffer from three\nmajor limitations: (1) lack of evaluation on image pairs that contain no\nchanges, leading to unreported false positive rates; (2) lack of\ncorrespondences (\\ie, localizing the regions before and after a change); and\n(3) poor zero-shot generalization across different domains. To address these\nissues, we introduce a novel method that leverages change correspondences (a)\nduring training to improve change detection accuracy, and (b) at test time, to\nminimize false positives. That is, we harness the supervision labels of where\nan object is added or removed to supervise change detectors, improving their\naccuracy over previous work by a large margin. Our work is also the first to\npredict correspondences between pairs of detected changes using estimated\nhomography and the Hungarian algorithm. 
Our model demonstrates superior\nperformance over existing methods, achieving state-of-the-art results in change\ndetection and change correspondence accuracy across both in-distribution and\nzero-shot benchmarks.\n","authors":["Hung Huy Nguyen","Pooyan Rahmanzadehgervi","Long Mail","Anh Totti Nguyen"],"pdf_url":"https://arxiv.org/pdf/2501.05555v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05554v1","updated":"2025-01-09T20:01:15Z","published":"2025-01-09T20:01:15Z","title":"LLMQuoter: Enhancing RAG Capabilities Through Efficient Quote Extraction\n From Large Contexts","summary":" We introduce LLMQuoter, a lightweight, distillation-based model designed to\nenhance Retrieval Augmented Generation (RAG) by extracting the most relevant\ntextual evidence for downstream reasoning tasks. Built on the LLaMA-3B\narchitecture and fine-tuned with Low-Rank Adaptation (LoRA) on a 15,000-sample\nsubset of HotpotQA, LLMQuoter adopts a \"quote-first-then-answer\" strategy,\nefficiently identifying key quotes before passing curated snippets to reasoning\nmodels. This workflow reduces cognitive overhead and outperforms full-context\napproaches like Retrieval-Augmented Fine-Tuning (RAFT), achieving over 20-point\naccuracy gains across both small and large language models. By leveraging\nknowledge distillation from a high-performing teacher model, LLMQuoter achieves\ncompetitive results in a resource-efficient fine-tuning setup. It democratizes\nadvanced RAG capabilities, delivering significant performance improvements\nwithout requiring extensive model retraining. 
Our results highlight the\npotential of distilled quote-based reasoning to streamline complex workflows,\noffering a scalable and practical solution for researchers and practitioners\nalike.\n","authors":["Yuri Facanha Bezerra","Li Weigang"],"pdf_url":"https://arxiv.org/pdf/2501.05554v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.00190v2","updated":"2025-01-09T20:00:16Z","published":"2024-12-31T00:02:07Z","title":"SepsisCalc: Integrating Clinical Calculators into Early Sepsis\n Prediction via Dynamic Temporal Graph Construction","summary":" Sepsis is an organ dysfunction caused by a deregulated immune response to an\ninfection. Early sepsis prediction and identification allow for timely\nintervention, leading to improved clinical outcomes. Clinical calculators\n(e.g., the six-organ dysfunction assessment of SOFA) play a vital role in\nsepsis identification within clinicians' workflow, providing evidence-based\nrisk assessments essential for sepsis diagnosis. However, artificial\nintelligence (AI) sepsis prediction models typically generate a single sepsis\nrisk score without incorporating clinical calculators for assessing organ\ndysfunctions, making the models less convincing and transparent to clinicians.\nTo bridge the gap, we propose to mimic clinicians' workflow with a novel\nframework SepsisCalc to integrate clinical calculators into the predictive\nmodel, yielding a clinically transparent and precise model for utilization in\nclinical settings. Practically, clinical calculators usually combine\ninformation from multiple component variables in Electronic Health Records\n(EHR), and might not be applicable when the variables are (partially) missing.\nWe mitigate this issue by representing EHRs as temporal graphs and integrating\na learning module to dynamically add the accurately estimated calculator to the\ngraphs. 
Experimental results on real-world datasets show that the proposed\nmodel outperforms state-of-the-art methods on sepsis prediction tasks.\nMoreover, we developed a system to identify organ dysfunctions and potential\nsepsis risks, providing a human-AI interaction tool for deployment, which can\nhelp clinicians understand the prediction outputs and prepare timely\ninterventions for the corresponding dysfunctions, paving the way for actionable\nclinical decision-making support for early intervention.\n","authors":["Changchang Yin","Shihan Fu","Bingsheng Yao","Thai-Hoang Pham","Weidan Cao","Dakuo Wang","Jeffrey Caterino","Ping Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.00190v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05552v1","updated":"2025-01-09T19:56:44Z","published":"2025-01-09T19:56:44Z","title":"The dynamics of meaning through time: Assessment of Large Language\n Models","summary":" Understanding how large language models (LLMs) grasp the historical context\nof concepts and their semantic evolution is essential in advancing artificial\nintelligence and linguistic studies. This study aims to evaluate the\ncapabilities of various LLMs in capturing temporal dynamics of meaning,\nspecifically how they interpret terms across different time periods. We analyze\na diverse set of terms from multiple domains, using tailored prompts and\nmeasuring responses through both objective metrics (e.g., perplexity and word\ncount) and subjective human expert evaluations. Our comparative analysis\nincludes prominent models like ChatGPT, GPT-4, Claude, Bard, Gemini, and Llama.\nFindings reveal marked differences in each model's handling of historical\ncontext and semantic shifts, highlighting both strengths and limitations in\ntemporal semantic understanding. 
These insights offer a foundation for refining\nLLMs to better address the evolving nature of language, with implications for\nhistorical text analysis, AI design, and applications in digital humanities.\n","authors":["Mohamed Taher Alrefaie","Fatty Salem","Nour Eldin Morsy","Nada Samir","Mohamed Medhat Gaber"],"pdf_url":"https://arxiv.org/pdf/2501.05552v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.02075v2","updated":"2025-01-09T19:54:53Z","published":"2023-04-04T18:58:16Z","title":"GUTS: Generalized Uncertainty-Aware Thompson Sampling for Multi-Agent\n Active Search","summary":" Robotic solutions for quick disaster response are essential to ensure minimal\nloss of life, especially when the search area is too dangerous or too vast for\nhuman rescuers. We model this problem as an asynchronous multi-agent\nactive-search task where each robot aims to efficiently seek objects of\ninterest (OOIs) in an unknown environment. This formulation addresses the\nrequirement that search missions should focus on quick recovery of OOIs rather\nthan full coverage of the search region. Previous approaches fail to accurately\nmodel sensing uncertainty, account for occlusions due to foliage or terrain, or\nconsider the requirement for heterogeneous search teams and robustness to\nhardware and communication failures. We present the Generalized\nUncertainty-aware Thompson Sampling (GUTS) algorithm, which addresses these\nissues and is suitable for deployment on heterogeneous multi-robot systems for\nactive search in large unstructured environments. We show through simulation\nexperiments that GUTS consistently outperforms existing methods such as\nparallelized Thompson Sampling and exhaustive search, recovering all OOIs in\n80% of all runs. In contrast, existing approaches recover all OOIs in less than\n40% of all runs. We conduct field tests using our multi-robot system in an\nunstructured environment with a search area of approximately 75,000 sq. m. 
Our\nsystem demonstrates robustness to various failure modes, achieving full\nrecovery of OOIs (where feasible) in every field run, and significantly\noutperforming our baseline.\n","authors":["Nikhil Angad Bakshi","Tejus Gupta","Ramina Ghods","Jeff Schneider"],"pdf_url":"https://arxiv.org/pdf/2304.02075v2.pdf","comment":"7 pages, 5 figures, 1 table, for associated video see:\n https://youtu.be/K0jkzdQ_j2E , published in International Conference on\n Robotics and Automation (ICRA) 2023. Outstanding Deployed Systems Paper\n Winner"},{"id":"http://arxiv.org/abs/2412.08755v2","updated":"2025-01-09T19:15:20Z","published":"2024-12-11T19:54:14Z","title":"Proactive Adversarial Defense: Harnessing Prompt Tuning in\n Vision-Language Models to Detect Unseen Backdoored Images","summary":" Backdoor attacks pose a critical threat by embedding hidden triggers into\ninputs, causing models to misclassify them into target labels. While extensive\nresearch has focused on mitigating these attacks in object recognition models\nthrough weight fine-tuning, much less attention has been given to detecting\nbackdoored samples directly. Given the vast datasets used in training, manual\ninspection for backdoor triggers is impractical, and even state-of-the-art\ndefense mechanisms fail to fully neutralize their impact. To address this gap,\nwe introduce a groundbreaking method to detect unseen backdoored images during\nboth training and inference. 
Leveraging the transformative success of prompt\ntuning in Vision Language Models (VLMs), our approach trains learnable text\nprompts to differentiate clean images from those with hidden backdoor triggers.\nExperiments demonstrate the exceptional efficacy of this method, achieving an\nimpressive average accuracy of 86% across two renowned datasets for detecting\nunseen backdoor triggers, establishing a new standard in backdoor defense.\n","authors":["Kyle Stein","Andrew Arash Mahyari","Guillermo Francia","Eman El-Sheikh"],"pdf_url":"https://arxiv.org/pdf/2412.08755v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05510v1","updated":"2025-01-09T19:00:01Z","published":"2025-01-09T19:00:01Z","title":"OVO-Bench: How Far is Your Video-LLMs from Real-World Online Video\n Understanding?","summary":" Temporal Awareness, the ability to reason dynamically based on the timestamp\nwhen a question is raised, is the key distinction between offline and online\nvideo LLMs. Unlike offline models, which rely on complete videos for static,\npost hoc analysis, online models process video streams incrementally and\ndynamically adapt their responses based on the timestamp at which the question\nis posed. Despite its significance, temporal awareness has not been adequately\nevaluated in existing benchmarks. To fill this gap, we present OVO-Bench\n(Online-VideO-Benchmark), a novel video benchmark that emphasizes the\nimportance of timestamps for advanced online video understanding capability\nbenchmarking. OVO-Bench evaluates the ability of video LLMs to reason and\nrespond to events occurring at specific timestamps under three distinct\nscenarios: (1) Backward tracing: trace back to past events to answer the\nquestion. (2) Real-time understanding: understand and respond to events as they\nunfold at the current timestamp. (3) Forward active responding: delay the\nresponse until sufficient future information becomes available to answer the\nquestion accurately. 
OVO-Bench comprises 12 tasks, featuring 644 unique videos\nand approximately human-curated 2,800 fine-grained meta-annotations with\nprecise timestamps. We combine automated generation pipelines with human\ncuration. With these high-quality samples, we further developed an evaluation\npipeline to systematically query video LLMs along the video timeline.\nEvaluations of nine Video-LLMs reveal that, despite advancements on traditional\nbenchmarks, current models struggle with online video understanding, showing a\nsignificant gap compared to human agents. We hope OVO-Bench will drive progress\nin video LLMs and inspire future research in online video reasoning. Our\nbenchmark and code can be accessed at https://github.com/JoeLeelyf/OVO-Bench.\n","authors":["Yifei Li","Junbo Niu","Ziyang Miao","Chunjiang Ge","Yuanhang Zhou","Qihao He","Xiaoyi Dong","Haodong Duan","Shuangrui Ding","Rui Qian","Pan Zhang","Yuhang Zang","Yuhang Cao","Conghui He","Jiaqi Wang"],"pdf_url":"https://arxiv.org/pdf/2501.05510v1.pdf","comment":"28 pages"},{"id":"http://arxiv.org/abs/2501.05501v1","updated":"2025-01-09T18:43:05Z","published":"2025-01-09T18:43:05Z","title":"Strategy Masking: A Method for Guardrails in Value-based Reinforcement\n Learning Agents","summary":" The use of reward functions to structure AI learning and decision making is\ncore to the current reinforcement learning paradigm; however, without careful\ndesign of reward functions, agents can learn to solve problems in ways that may\nbe considered ``undesirable\" or ``unethical. Without thorough understanding of\nthe incentives a reward function creates, it can be difficult to impose\nprincipled yet general control mechanisms over its behavior. In this paper, we\nstudy methods for constructing guardrails for AI agents that use reward\nfunctions to learn decision making. We introduce a novel approach, which we\ncall strategy masking, to explicitly learn and then suppress undesirable AI\nagent behavior. 
We apply our method to study lying in AI agents and show that\nstrategy masking can effectively modify agent behavior by suppressing, or\nactively penalizing, the reward dimension for lying such that agents act more\nhonestly while not compromising their ability to perform effectively.\n","authors":["Jonathan Keane","Sam Keyser","Jeremy Kedziora"],"pdf_url":"https://arxiv.org/pdf/2501.05501v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05497v1","updated":"2025-01-09T17:20:00Z","published":"2025-01-09T17:20:00Z","title":"Spatial Information Integration in Small Language Models for Document\n Layout Generation and Classification","summary":" Document layout understanding is a field of study that analyzes the spatial\narrangement of information in a document hoping to understand its structure and\nlayout. Models such as LayoutLM (and its subsequent iterations) can understand\nsemi-structured documents with SotA results; however, the lack of open\nsemi-structured data is a limitation in itself. While semi-structured data is\ncommon in everyday life (balance sheets, purchase orders, receipts), there is a\nlack of public datasets for training machine learning models for this type of\ndocument. In this investigation we propose a method to generate new, synthetic,\nlayout information that can help overcoming this data shortage. According to\nour results, the proposed method performs better than LayoutTransformer,\nanother popular layout generation method. We also show that, in some scenarios,\ntext classification can improve when supported by bounding box information.\n","authors":["Pablo Melendez","Clemens Havas"],"pdf_url":"https://arxiv.org/pdf/2501.05497v1.pdf","comment":"8 pages. 
Symposium on Applied Computing 2025"},{"id":"http://arxiv.org/abs/2501.05496v1","updated":"2025-01-09T16:10:03Z","published":"2025-01-09T16:10:03Z","title":"FedSA: A Unified Representation Learning via Semantic Anchors for\n Prototype-based Federated Learning","summary":" Prototype-based federated learning has emerged as a promising approach that\nshares lightweight prototypes to transfer knowledge among clients with data\nheterogeneity in a model-agnostic manner. However, existing methods often\ncollect prototypes directly from local models, which inevitably introduce\ninconsistencies into representation learning due to the biased data\ndistributions and differing model architectures among clients. In this paper,\nwe identify that both statistical and model heterogeneity create a vicious\ncycle of representation inconsistency, classifier divergence, and skewed\nprototype alignment, which negatively impacts the performance of clients. To\nbreak the vicious cycle, we propose a novel framework named Federated Learning\nvia Semantic Anchors (FedSA) to decouple the generation of prototypes from\nlocal representation learning. We introduce a novel perspective that uses\nsimple yet effective semantic anchors serving as prototypes to guide local\nmodels in learning consistent representations. By incorporating semantic\nanchors, we further propose anchor-based regularization with margin-enhanced\ncontrastive learning and anchor-based classifier calibration to correct feature\nextractors and calibrate classifiers across clients, achieving intra-class\ncompactness and inter-class separability of prototypes while ensuring\nconsistent decision boundaries. We then update the semantic anchors with these\nconsistent and discriminative prototypes, which iteratively encourage clients\nto collaboratively learn a unified data representation with robust\ngeneralization. 
Extensive experiments under both statistical and model\nheterogeneity settings show that FedSA significantly outperforms existing\nprototype-based FL methods on various classification tasks.\n","authors":["Yanbing Zhou","Xiangmou Qu","Chenlong You","Jiyang Zhou","Jingyue Tang","Xin Zheng","Chunmao Cai","Yingbo Wu"],"pdf_url":"https://arxiv.org/pdf/2501.05496v1.pdf","comment":"Accepted by AAAI2025"},{"id":"http://arxiv.org/abs/2501.05495v1","updated":"2025-01-09T15:47:30Z","published":"2025-01-09T15:47:30Z","title":"LSEBMCL: A Latent Space Energy-Based Model for Continual Learning","summary":" Continual learning has become essential in many practical applications such\nas online news summaries and product classification. The primary challenge is\nknown as catastrophic forgetting, a phenomenon where a model inadvertently\ndiscards previously learned knowledge when it is trained on new tasks. Existing\nsolutions involve storing exemplars from previous classes, regularizing\nparameters during the fine-tuning process, or assigning different model\nparameters to each task. The proposed solution LSEBMCL (Latent Space\nEnergy-Based Model for Continual Learning) in this work is to use energy-based\nmodels (EBMs) to prevent catastrophic forgetting by sampling data points from\nprevious tasks when training on new ones. The EBM is a machine learning model\nthat associates an energy value with each input data point. The proposed method\nuses an EBM layer as an outer-generator in the continual learning framework for\nNLP tasks. 
The study demonstrates the efficacy of EBM in NLP tasks, achieving\nstate-of-the-art results in all experiments.\n","authors":["Xiaodi Li","Dingcheng Li","Rujun Gao","Mahmoud Zamani","Latifur Khan"],"pdf_url":"https://arxiv.org/pdf/2501.05495v1.pdf","comment":"In the 7th International Conference on Artificial Intelligence in\n Information and Communication (ICAIIC 2025)"},{"id":"http://arxiv.org/abs/2501.05238v1","updated":"2025-01-09T13:44:15Z","published":"2025-01-09T13:44:15Z","title":"FOCUS: Towards Universal Foreground Segmentation","summary":" Foreground segmentation is a fundamental task in computer vision,\nencompassing various subdivision tasks. Previous research has typically\ndesigned task-specific architectures for each task, leading to a lack of\nunification. Moreover, they primarily focus on recognizing foreground objects\nwithout effectively distinguishing them from the background. In this paper, we\nemphasize the importance of the background and its relationship with the\nforeground. We introduce FOCUS, the Foreground ObjeCts Universal Segmentation\nframework that can handle multiple foreground tasks. We develop a multi-scale\nsemantic network using the edge information of objects to enhance image\nfeatures. To achieve boundary-aware segmentation, we propose a novel\ndistillation method, integrating the contrastive learning strategy to refine\nthe prediction mask in multi-modal feature space. 
We conduct extensive\nexperiments on a total of 13 datasets across 5 tasks, and the results\ndemonstrate that FOCUS consistently outperforms the state-of-the-art\ntask-specific models on most metrics.\n","authors":["Zuyao You","Lingyu Kong","Lingchen Meng","Zuxuan Wu"],"pdf_url":"https://arxiv.org/pdf/2501.05238v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05057v1","updated":"2025-01-09T08:28:16Z","published":"2025-01-09T08:28:16Z","title":"LearningFlow: Automated Policy Learning Workflow for Urban Driving with\n Large Language Models","summary":" Recent advancements in reinforcement learning (RL) demonstrate the\nsignificant potential in autonomous driving. Despite this promise, challenges\nsuch as the manual design of reward functions and low sample efficiency in\ncomplex environments continue to impede the development of safe and effective\ndriving policies. To tackle these issues, we introduce LearningFlow, an\ninnovative automated policy learning workflow tailored to urban driving. This\nframework leverages the collaboration of multiple large language model (LLM)\nagents throughout the RL training process. LearningFlow includes a curriculum\nsequence generation process and a reward generation process, which work in\ntandem to guide the RL policy by generating tailored training curricula and\nreward functions. Particularly, each process is supported by an analysis agent\nthat evaluates training progress and provides critical insights to the\ngeneration agent. Through the collaborative efforts of these LLM agents,\nLearningFlow automates policy learning across a series of complex driving\ntasks, and it significantly reduces the reliance on manual reward function\ndesign while enhancing sample efficiency. 
Comprehensive experiments are\nconducted in the high-fidelity CARLA simulator, along with comparisons with\nother existing methods, to demonstrate the efficacy of our proposed approach.\nThe results demonstrate that LearningFlow excels in generating rewards and\ncurricula. It also achieves superior performance and robust generalization\nacross various driving tasks, as well as commendable adaptation to different RL\nalgorithms.\n","authors":["Zengqi Peng","Yubin Wang","Xu Han","Lei Zheng","Jun Ma"],"pdf_url":"https://arxiv.org/pdf/2501.05057v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05490v1","updated":"2025-01-09T07:36:28Z","published":"2025-01-09T07:36:28Z","title":"Interpretable deep learning illuminates multiple structures fluorescence\n imaging: a path toward trustworthy artificial intelligence in microscopy","summary":" Live-cell imaging of multiple subcellular structures is essential for\nunderstanding subcellular dynamics. However, the conventional multi-color\nsequential fluorescence microscopy suffers from significant imaging delays and\nlimited number of subcellular structure separate labeling, resulting in\nsubstantial limitations for real-time live-cell research applications. Here, we\npresent the Adaptive Explainable Multi-Structure Network (AEMS-Net), a\ndeep-learning framework that enables simultaneous prediction of two subcellular\nstructures from a single image. The model normalizes staining intensity and\nprioritizes critical image features by integrating attention mechanisms and\nbrightness adaptation layers. Leveraging the Kolmogorov-Arnold representation\ntheorem, our model decomposes learned features into interpretable univariate\nfunctions, enhancing the explainability of complex subcellular morphologies. We\ndemonstrate that AEMS-Net allows real-time recording of interactions between\nmitochondria and microtubules, requiring only half the conventional\nsequential-channel imaging procedures. 
Notably, this approach achieves over 30%\nimprovement in imaging quality compared to traditional deep learning methods,\nestablishing a new paradigm for long-term, interpretable live-cell imaging that\nadvances the ability to explore subcellular dynamics.\n","authors":["Mingyang Chen","Luhong Jin","Xuwei Xuan","Defu Yang","Yun Cheng","Ju Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.05490v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04952v1","updated":"2025-01-09T03:59:10Z","published":"2025-01-09T03:59:10Z","title":"Open Problems in Machine Unlearning for AI Safety","summary":" As AI systems become more capable, widely deployed, and increasingly\nautonomous in critical areas such as cybersecurity, biological research, and\nhealthcare, ensuring their safety and alignment with human values is paramount.\nMachine unlearning -- the ability to selectively forget or suppress specific\ntypes of knowledge -- has shown promise for privacy and data removal tasks,\nwhich has been the primary focus of existing research. More recently, its\npotential application to AI safety has gained attention. In this paper, we\nidentify key limitations that prevent unlearning from serving as a\ncomprehensive solution for AI safety, particularly in managing dual-use\nknowledge in sensitive domains like cybersecurity and chemical, biological,\nradiological, and nuclear (CBRN) safety. In these contexts, information can be\nboth beneficial and harmful, and models may combine seemingly harmless\ninformation for harmful purposes -- unlearning this information could strongly\naffect beneficial uses. We provide an overview of inherent constraints and open\nproblems, including the broader side effects of unlearning dangerous knowledge,\nas well as previously unexplored tensions between unlearning and existing\nsafety mechanisms. Finally, we investigate challenges related to evaluation,\nrobustness, and the preservation of safety features during unlearning. 
By\nmapping these limitations and open challenges, we aim to guide future research\ntoward realistic applications of unlearning within a broader AI safety\nframework, acknowledging its limitations and highlighting areas where\nalternative approaches may be required.\n","authors":["Fazl Barez","Tingchen Fu","Ameya Prabhu","Stephen Casper","Amartya Sanyal","Adel Bibi","Aidan O'Gara","Robert Kirk","Ben Bucknall","Tim Fist","Luke Ong","Philip Torr","Kwok-Yan Lam","Robert Trager","David Krueger","Sören Mindermann","José Hernandez-Orallo","Mor Geva","Yarin Gal"],"pdf_url":"https://arxiv.org/pdf/2501.04952v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2006.02482v5","updated":"2025-01-09T02:10:04Z","published":"2020-06-03T19:02:34Z","title":"Explaining the Behavior of Black-Box Prediction Algorithms with Causal\n Learning","summary":" Causal approaches to post-hoc explainability for black-box prediction models\n(e.g., deep neural networks trained on image pixel data) have become\nincreasingly popular. However, existing approaches have two important\nshortcomings: (i) the \"explanatory units\" are micro-level inputs into the\nrelevant prediction model, e.g., image pixels, rather than interpretable\nmacro-level features that are more useful for understanding how to possibly\nchange the algorithm's behavior, and (ii) existing approaches assume there\nexists no unmeasured confounding between features and target model predictions,\nwhich fails to hold when the explanatory units are macro-level variables. Our\nfocus is on the important setting where the analyst has no access to the inner\nworkings of the target prediction algorithm, rather only the ability to query\nthe output of the model in response to a particular input. 
To provide causal\nexplanations in such a setting, we propose to learn causal graphical\nrepresentations that allow for arbitrary unmeasured confounding among features.\nWe demonstrate the resulting graph can differentiate between interpretable\nfeatures that causally influence model predictions versus those that are merely\nassociated with model predictions due to confounding. Our approach is motivated\nby a counterfactual theory of causal explanation wherein good explanations\npoint to factors that are \"difference-makers\" in an interventionist sense.\n","authors":["Numair Sani","Daniel Malinsky","Ilya Shpitser"],"pdf_url":"https://arxiv.org/pdf/2006.02482v5.pdf","comment":null}]},"2025-01-10T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2501.06132v1","updated":"2025-01-10T17:44:57Z","published":"2025-01-10T17:44:57Z","title":"CoDriveVLM: VLM-Enhanced Urban Cooperative Dispatching and Motion\n Planning for Future Autonomous Mobility on Demand Systems","summary":" The increasing demand for flexible and efficient urban transportation\nsolutions has spotlighted the limitations of traditional Demand Responsive\nTransport (DRT) systems, particularly in accommodating diverse passenger needs\nand dynamic urban environments. Autonomous Mobility-on-Demand (AMoD) systems\nhave emerged as a promising alternative, leveraging connected and autonomous\nvehicles (CAVs) to provide responsive and adaptable services. However, existing\nmethods primarily focus on either vehicle scheduling or path planning, which\noften simplify complex urban layouts and neglect the necessity for simultaneous\ncoordination and mutual avoidance among CAVs. This oversimplification poses\nsignificant challenges to the deployment of AMoD systems in real-world\nscenarios. To address these gaps, we propose CoDriveVLM, a novel framework that\nintegrates high-fidelity simultaneous dispatching and cooperative motion\nplanning for future AMoD systems. 
Our method harnesses Vision-Language Models\n(VLMs) to enhance multi-modality information processing, and this enables\ncomprehensive dispatching and collision risk evaluation. The VLM-enhanced CAV\ndispatching coordinator is introduced to effectively manage complex and\nunforeseen AMoD conditions, thus supporting efficient scheduling\ndecision-making. Furthermore, we propose a scalable decentralized cooperative\nmotion planning method via consensus alternating direction method of\nmultipliers (ADMM) focusing on collision risk evaluation and decentralized\ntrajectory optimization. Simulation results demonstrate the feasibility and\nrobustness of CoDriveVLM in various traffic conditions, showcasing its\npotential to significantly improve the fidelity and effectiveness of AMoD\nsystems in future urban transportation networks. The code is available at\nhttps://github.com/henryhcliu/CoDriveVLM.git.\n","authors":["Haichao Liu","Ruoyu Yao","Wenru Liu","Zhenmin Huang","Shaojie Shen","Jun Ma"],"pdf_url":"https://arxiv.org/pdf/2501.06132v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.02189v2","updated":"2025-01-10T17:43:10Z","published":"2025-01-04T04:59:33Z","title":"Benchmark Evaluations, Applications, and Challenges of Large Vision\n Language Models: A Survey","summary":" Multimodal Vision Language Models (VLMs) have emerged as a transformative\ntechnology at the intersection of computer vision and natural language\nprocessing, enabling machines to perceive and reason about the world through\nboth visual and textual modalities. For example, models such as CLIP, Claude,\nand GPT-4V demonstrate strong reasoning and understanding abilities on visual\nand textual data and beat classical single modality vision models on zero-shot\nclassification. 
Despite their rapid advancements in research and growing\npopularity in applications, a comprehensive survey of existing studies on VLMs\nis notably lacking, particularly for researchers aiming to leverage VLMs in\ntheir specific domains. To this end, we provide a systematic overview of VLMs\nin the following aspects: model information of the major VLMs developed over\nthe past five years (2019-2024); the main architectures and training methods of\nthese VLMs; summary and categorization of the popular benchmarks and evaluation\nmetrics of VLMs; the applications of VLMs including embodied agents, robotics,\nand video generation; the challenges and issues faced by current VLMs such as\nhallucination, fairness, and safety. Detailed collections including papers and\nmodel repository links are listed in\nhttps://github.com/zli12321/Awesome-VLM-Papers-And-Models.git.\n","authors":["Zongxia Li","Xiyang Wu","Hongyang Du","Huy Nghiem","Guangyao Shi"],"pdf_url":"https://arxiv.org/pdf/2501.02189v2.pdf","comment":"35 pages, 3 figures"},{"id":"http://arxiv.org/abs/2501.06130v1","updated":"2025-01-10T17:35:29Z","published":"2025-01-10T17:35:29Z","title":"A Mixed-Integer Conic Program for the Multi-Agent Moving-Target\n Traveling Salesman Problem","summary":" The Moving-Target Traveling Salesman Problem (MT-TSP) aims to find a shortest\npath for an agent that starts at a stationary depot, visits a set of moving\ntargets exactly once, each within one of their respective time windows, and\nthen returns to the depot. In this paper, we introduce a new Mixed-Integer\nConic Program (MICP) formulation that finds the optimum for the Multi-Agent\nMoving-Target Traveling Salesman Problem (MA-MT-TSP), a generalization of the\nMT-TSP involving multiple agents. We obtain our formulation by first restating\nthe current state-of-the-art MICP formulation for MA-MT-TSP as a Mixed-Integer\nNonlinear Nonconvex Program, and then reformulating it as a new MICP. 
We\npresent computational results to demonstrate the performance of our approach.\nThe results show that our formulation significantly outperforms the\nstate-of-the-art, with up to a two-order-of-magnitude reduction in runtime, and\nup to over 90% tighter optimality gap.\n","authors":["Allen George Philip","Zhongqiang Ren","Sivakumar Rathinam","Howie Choset"],"pdf_url":"https://arxiv.org/pdf/2501.06130v1.pdf","comment":"7 pages, 3 figures"},{"id":"http://arxiv.org/abs/2501.06122v1","updated":"2025-01-10T17:21:04Z","published":"2025-01-10T17:21:04Z","title":"NDOB-Based Control of a UAV with Delta-Arm Considering Manipulator\n Dynamics","summary":" Aerial Manipulators (AMs) provide a versatile platform for various\napplications, including 3D printing, architecture, and aerial grasping\nmissions. However, their operational speed is often sacrificed to uphold\nprecision. Existing control strategies for AMs often regard the manipulator as\na disturbance and employ robust control methods to mitigate its influence. This\nresearch focuses on elevating the precision of the end-effector and enhancing\nthe agility of aerial manipulator movements. We present a composite control\nscheme to address these challenges. Initially, a Nonlinear Disturbance Observer\n(NDOB) is utilized to compensate for internal coupling effects and external\ndisturbances. Subsequently, manipulator dynamics are processed through a high\npass filter to facilitate agile movements. By integrating the proposed control\nmethod into a fully autonomous delta-arm-based AM system, we substantiate the\ncontroller's efficacy through extensive real-world experiments. 
The outcomes\nillustrate that the end-effector can achieve accuracy at the millimeter level.\n","authors":["Hongming Chen","Biyu Ye","Xianqi Liang","Weiliang Deng","Ximin Lyu"],"pdf_url":"https://arxiv.org/pdf/2501.06122v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06115v1","updated":"2025-01-10T17:12:30Z","published":"2025-01-10T17:12:30Z","title":"Development of an Advisory System for Parking of a Car and Trailer","summary":" Trailer parking is a challenging task due to the unstable nature of the\nvehicle-trailer system in reverse motion and the unintuitive steering actions\nrequired at the vehicle to accomplish the parking maneuver. This paper presents\na strategy to tackle this kind of maneuver with an advisory graphic aid to help\nthe human driver with the task of manually backing up the vehicle-trailer\nsystem. A kinematic vehicle-trailer model is derived to describe the low-speed\nmotion of the vehicle-trailer system, and its inverse kinematics is established\nby generating an equivalent virtual trailer axle steering command. The advisory\nsystem graphics is generated based on the inverse kinematics and displays the\nexpected trailer orientation given the current vehicle steer angle and\nconfiguration (hitch angle). Simulation study and animation are set up to test\nthe efficacy of the approach, where the user can select both vehicle speed and\nvehicle steering angle freely, which allows the user to stop the\nvehicle-trailer system and experiment with different steering inputs to see\ntheir effect on the predicted trailer motion before proceeding with the best\none according to the advisory graphics, hence creating a series of piecewise\ncontinuous control actions similar to how manual trailer reverse parking is\nusually carried out. 
The advisory graphics proves to provide the driver with an\nintuitive understanding of the trailer motion at any given configuration (hitch\nangle).\n","authors":["Xincheng Cao","Haochong Chen","Bilin Aksun Guvenc","Levent Guvenc","Shihong Fan","John Harber","Brian Link","Peter Richmond","Dokyung Yim"],"pdf_url":"https://arxiv.org/pdf/2501.06115v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06113v1","updated":"2025-01-10T17:05:59Z","published":"2025-01-10T17:05:59Z","title":"Vehicle-in-Virtual-Environment (VVE) Based Autonomous Driving Function\n Development and Evaluation Methodology for Vulnerable Road User Safety","summary":" Traditional methods for developing and evaluating autonomous driving\nfunctions, such as model-in-the-loop (MIL) and hardware-in-the-loop (HIL)\nsimulations, heavily depend on the accuracy of simulated vehicle models and\nhuman factors, especially for vulnerable road user safety systems. Continuation\nof development during public road deployment forces other road users including\nvulnerable ones to involuntarily participate in the development process,\nleading to safety risks, inefficiencies, and a decline in public trust. To\naddress these deficiencies, the Vehicle-in-Virtual-Environment (VVE) method was\nproposed as a safer, more efficient, and cost-effective solution for developing\nand testing connected and autonomous driving technologies by operating the real\nvehicle and multiple other actors like vulnerable road users in different test\nareas while being immersed within the same highly realistic virtual\nenvironment. This VVE approach synchronizes real-world vehicle and vulnerable\nroad user motion within the same virtual scenario, enabling the safe and\nrealistic testing of various traffic situations in a safe and repeatable\nmanner. In this paper, we propose a new testing pipeline that sequentially\nintegrates MIL, HIL, and VVE methods to comprehensively develop and evaluate\nautonomous driving functions. 
The effectiveness of this testing pipeline will\nbe demonstrated using an autonomous driving path-tracking algorithm with local\ndeep reinforcement learning modification for vulnerable road user collision\navoidance.\n","authors":["Haochong Chen","Xincheng Cao","Levent Guvenc","Bilin Aksun Guvenc"],"pdf_url":"https://arxiv.org/pdf/2501.06113v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06089v1","updated":"2025-01-10T16:39:01Z","published":"2025-01-10T16:39:01Z","title":"Towards Developing Socially Compliant Automated Vehicles: State of the\n Art, Experts Expectations, and A Conceptual Framework","summary":" Automated Vehicles (AVs) hold promise for revolutionizing transportation by\nimproving road safety, traffic efficiency, and overall mobility. Despite the\nsteady advancement in high-level AVs in recent years, the transition to full\nautomation entails a period of mixed traffic, where AVs of varying automation\nlevels coexist with human-driven vehicles (HDVs). Making AVs socially compliant\nand understood by human drivers is expected to improve the safety and\nefficiency of mixed traffic. Thus, ensuring AVs compatibility with HDVs and\nsocial acceptance is crucial for their successful and seamless integration into\nmixed traffic. However, research in this critical area of developing Socially\nCompliant AVs (SCAVs) remains sparse. This study carries out the first\ncomprehensive scoping review to assess the current state of the art in\ndeveloping SCAVs, identifying key concepts, methodological approaches, and\nresearch gaps. An expert interview was also conducted to identify critical\nresearch gaps and expectations towards SCAVs. Based on the scoping review and\nexpert interview input, a conceptual framework is proposed for the development\nof SCAVs. The conceptual framework is evaluated using an online survey\ntargeting researchers, technicians, policymakers, and other relevant\nprofessionals worldwide. 
The survey results provide valuable validation and\ninsights, affirming the significance of the proposed conceptual framework in\ntackling the challenges of integrating AVs into mixed-traffic environments.\nAdditionally, future research perspectives and suggestions are discussed,\ncontributing to the research and development agenda of SCAVs.\n","authors":["Yongqi Dong","Bart van Arem","Haneen Farah"],"pdf_url":"https://arxiv.org/pdf/2501.06089v1.pdf","comment":"39 pages, 13 figures, under review by the journal of Transportation\n Research Part E: Logistics and Transportation Review"},{"id":"http://arxiv.org/abs/2501.06088v1","updated":"2025-01-10T16:36:15Z","published":"2025-01-10T16:36:15Z","title":"Non-planar 3D Printing of Double Shells","summary":" We present a method to fabricate double shell structures printed in\ntransversal directions using multi-axis fused-deposition-modeling (FDM)\nrobotic 3D printing. Shell structures, characterized by lightweight, thin\nwalls, fast buildup, and minimal material usage, find diverse applications in\nprototyping and architecture for uses such as fa\c{c}ade panels, molds for\nconcrete casting, or full-scale pavilions. We leverage an underlying\nrepresentation of transversal strip networks generated using existing methods\nand propose a methodology for converting them into printable partitions. Each\npartition is printed separately and assembled into a double-shell structure. We\noutline the specifications and workflow that make the printing of each piece\nand the subsequent assembly process feasible. 
The versatility and robustness\nof our method are demonstrated with both digital and fabricated results on\nsurfaces of different scales and geometric complexity.\n","authors":["Ioanna Mitropoulou","Amir Vaxman","Olga Diamanti","Benjamin Dillenburger"],"pdf_url":"https://arxiv.org/pdf/2501.06088v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14762v2","updated":"2025-01-10T15:54:54Z","published":"2024-12-19T11:43:13Z","title":"A General Control Method for Human-Robot Integration","summary":" This paper introduces a new generalized control method designed for\nmulti-degrees-of-freedom devices to help people with limited motion\ncapabilities in their daily activities. The challenge lies in finding the most\nadapted strategy for the control interface to effectively map user's motions in\na low-dimensional space to complex robotic assistive devices, such as\nprostheses, supernumerary limbs, up to remote robotic avatars. The goal is a\nsystem which integrates the human and the robotic parts into a unique system,\nmoving so as to reach the targets decided by the human while autonomously\nreducing the user's effort and discomfort. We present a framework to control\ngeneral multi DoFs assistive systems, which translates user-performed\ncompensatory motions into the necessary robot commands for reaching targets\nwhile canceling or reducing compensation. The framework extends to prostheses\nof any number of DoF up to full robotic avatars, regarded here as a sort of\nwhole-body prosthesis of the person who sees the robot as an artificial\nextension of their own body without a physical link but with a sensory-motor\nintegration. We have validated and applied this control strategy through tests\nencompassing simulated scenarios and real-world trials involving a virtual twin\nof the robotic parts (prosthesis and robot) and a physical humanoid avatar.\n","authors":["Maddalena Feder","Giorgio Grioli","Manuel G. 
Catalano","Antonio Bicchi"],"pdf_url":"https://arxiv.org/pdf/2412.14762v2.pdf","comment":"Submitted to the International Journal of Robotics Research (IJRR),\n under review since October 2024, 16 pages, 30 figures"},{"id":"http://arxiv.org/abs/2406.11136v2","updated":"2025-01-10T15:53:00Z","published":"2024-06-17T01:47:11Z","title":"Robots in Family Routines: Development of and Initial Insights from the\n Family-Robot Routines Inventory","summary":" Despite advances in areas such as the personalization of robots, sustaining\nadoption of robots for long-term use in families remains a challenge. Recent\nstudies have identified integrating robots into families' routines and rituals\nas a promising approach to support long-term adoption. However, few studies\nexplored the integration of robots into family routines and there is a gap in\nsystematic measures to capture family preferences for robot integration.\nBuilding upon existing routine inventories, we developed Family-Robot Routines\nInventory (FRRI), with 24 family routines and 24 child routine items, to\ncapture parents' attitudes toward and expectations from the integration of\nrobotic technology into their family routines. Using this inventory, we\ncollected data from 150 parents through an online survey. Our analysis\nindicates that parents had varying perceptions for the utility of integrating\nrobots into their routines. For example, parents found robot integration to be\nmore helpful in children's individual routines, than to the collective routines\nof their families. We discuss the design implications of these preliminary\nfindings, and how they may serve as a first step toward understanding the\ndiverse challenges and demands of designing and integrating household robots\nfor families.\n","authors":["Michael F. 
Xu","Bengisu Cagiltay","Joseph Michaelis","Sarah Sebo","Bilge Mutlu"],"pdf_url":"https://arxiv.org/pdf/2406.11136v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04860v2","updated":"2025-01-10T15:47:19Z","published":"2025-01-08T22:22:15Z","title":"Exploring the Use of Robots for Diary Studies","summary":" As interest in studying in-the-wild human-robot interaction grows, there is a\nneed for methods to collect data over time and in naturalistic or potentially\nprivate environments. HRI researchers have increasingly used the diary method\nfor these studies, asking study participants to self-administer a structured\ndata collection instrument, i.e., a diary, over a period of time. Although the\ndiary method offers a unique window into settings that researchers may not have\naccess to, they also lack the interactivity and probing that interview-based\nmethods offer. In this paper, we explore a novel data collection method in\nwhich a robot plays the role of an interactive diary. We developed the Diary\nRobot system and performed in-home deployments for a week to evaluate the\nfeasibility and effectiveness of this approach. Using traditional text-based\nand audio-based diaries as benchmarks, we found that robots are able to\neffectively elicit the intended information. We reflect on our findings, and\ndescribe scenarios where the utilization of robots in diary studies as a data\ncollection instrument may be especially applicable.\n","authors":["Michael F. 
Xu","Bilge Mutlu"],"pdf_url":"https://arxiv.org/pdf/2501.04860v2.pdf","comment":"Proceedings of the 20th ACM/IEEE International Conference on Human\n Robot Interaction (HRI 2025)"},{"id":"http://arxiv.org/abs/2408.00907v3","updated":"2025-01-10T15:35:20Z","published":"2024-08-01T20:56:28Z","title":"The Harmonic Exponential Filter for Nonparametric Estimation on Motion\n Groups","summary":" Bayesian estimation is a vital tool in robotics as it allows systems to\nupdate the robot state belief using incomplete information from noisy sensors.\nTo render the state estimation problem tractable, many systems assume that the\nmotion and measurement noise, as well as the state distribution, are unimodal\nand Gaussian. However, there are numerous scenarios and systems that do not\ncomply with these assumptions. Existing nonparametric filters that are used to\nmodel multimodal distributions have drawbacks that limit their ability to\nrepresent a diverse set of distributions. This paper introduces a novel\napproach to nonparametric Bayesian filtering on motion groups, designed to\nhandle multimodal distributions using harmonic exponential distributions. This\napproach leverages two key insights of harmonic exponential distributions: a)\nthe product of two distributions can be expressed as the element-wise addition\nof their log-likelihood Fourier coefficients, and b) the convolution of two\ndistributions can be efficiently computed as the tensor product of their\nFourier coefficients. These observations enable the development of an efficient\nand asymptotically exact solution to the Bayes filter up to the band limit of a\nFourier transform. We demonstrate our filter's performance compared with\nestablished nonparametric filtering methods across simulated and real-world\nlocalization tasks.\n","authors":["Miguel Saavedra-Ruiz","Steven A. 
Parkison","Ria Arora","James Richard Forbes","Liam Paull"],"pdf_url":"https://arxiv.org/pdf/2408.00907v3.pdf","comment":"Accepted to the IEEE Robotics and Automation Letters (RA-L 2025) Code\n available at https://github.com/montrealrobotics/harmonic-filter. Webpage and\n additional videos at https://montrealrobotics.ca/hef/"},{"id":"http://arxiv.org/abs/2501.06047v1","updated":"2025-01-10T15:28:24Z","published":"2025-01-10T15:28:24Z","title":"Learning Affordances from Interactive Exploration using an Object-level\n Map","summary":" Many robotic tasks in real-world environments require physical interactions\nwith an object such as pick up or push. For successful interactions, the robot\nneeds to know the object's affordances, which are defined as the potential\nactions the robot can perform with the object. In order to learn a\nrobot-specific affordance predictor, we propose an interactive exploration\npipeline which allows the robot to collect interaction experiences while\nexploring an unknown environment. We integrate an object-level map in the\nexploration pipeline such that the robot can identify different object\ninstances and track objects across diverse viewpoints. This results in denser\nand more accurate affordance annotations compared to state-of-the-art methods,\nwhich do not incorporate a map. 
We show that our affordance exploration\napproach makes exploration more efficient and results in more accurate\naffordance prediction models compared to baseline methods.\n","authors":["Paula Wulkop","Halil Umut Özdemir","Antonia Hüfner","Jen Jen Chung","Roland Siegwart","Lionel Ott"],"pdf_url":"https://arxiv.org/pdf/2501.06047v1.pdf","comment":"International Symposium of Robotics Research (ISRR) 2024"},{"id":"http://arxiv.org/abs/1907.03817v2","updated":"2025-01-10T13:14:26Z","published":"2019-07-08T19:10:56Z","title":"Towards the Internet of Robotic Things: Analysis, Architecture,\n Components and Challenges","summary":" Internet of Things (IoT) and robotics cannot be considered two separate\ndomains these days. Internet of Robotics Things (IoRT) is a concept that has\nbeen recently introduced to describe the integration of robotics technologies\nin IoT scenarios. As a consequence, these two research fields have started\ninteracting, and thus linking research communities. In this paper we intend to\nmake further steps in joining the two communities and broaden the discussion on\nthe development of this interdisciplinary field. The paper provides an\noverview, analysis and challenges of possible solutions for the Internet of\nRobotic Things, discussing the issues of the IoRT architecture, the integration\nof smart spaces and robotic applications.\n","authors":["Ilya Afanasyev","Manuel Mazzara","Subham Chakraborty","Nikita Zhuchkov","Aizhan Maksatbek","Mohamad Kassab","Salvatore Distefano"],"pdf_url":"https://arxiv.org/pdf/1907.03817v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16111v2","updated":"2025-01-10T12:56:47Z","published":"2024-09-24T14:19:47Z","title":"CloudTrack: Scalable UAV Tracking with Cloud Semantics","summary":" Nowadays, unmanned aerial vehicles (UAVs) are commonly used in search and\nrescue scenarios to gather information in the search area. 
The automatic\nidentification of the person searched for in aerial footage could increase the\nautonomy of such systems, reduce the search time, and thus increase the missed\nperson's chances of survival. In this paper, we present a novel approach to\nperform semantically conditioned open vocabulary object tracking that is\nspecifically designed to cope with the limitations of UAV hardware. Our\napproach has several advantages. It can run with verbal descriptions of the\nmissing person, e.g., the color of the shirt, it does not require dedicated\ntraining to execute the mission and can efficiently track a potentially moving\nperson. Our experimental results demonstrate the versatility and efficacy of\nour approach.\n","authors":["Yannik Blei","Michael Krawez","Nisarga Nilavadi","Tanja Katharina Kaiser","Wolfram Burgard"],"pdf_url":"https://arxiv.org/pdf/2409.16111v2.pdf","comment":"7 pages, 3 figures"},{"id":"http://arxiv.org/abs/2501.05931v1","updated":"2025-01-10T12:54:33Z","published":"2025-01-10T12:54:33Z","title":"Environment Modeling for Service Robots From a Task Execution\n Perspective","summary":" Service robots are increasingly entering the home to provide domestic tasks\nfor residents. However, when working in an open, dynamic, and unstructured home\nenvironment, service robots still face challenges such as low intelligence for\ntask execution and poor long-term autonomy (LTA), which has limited their\ndeployment. As the basis of robotic task execution, environment modeling has\nattracted significant attention. This integrates core technologies such as\nenvironment perception, understanding, and representation to accurately\nrecognize environmental information. This paper presents a comprehensive survey\nof environmental modeling from a new task-executionoriented perspective. 
In\nparticular, guided by the requirements of robots in performing domestic service\ntasks in the home environment, we systematically review the progress that has\nbeen made in task-execution-oriented environmental modeling in four respects:\n1) localization, 2) navigation, 3) manipulation, and 4) LTA. Current challenges\nare discussed, and potential research opportunities are also highlighted.\n","authors":["Ying Zhang","Guohui Tian","Cui-Hua Zhang","Changchun Hua","Weili Ding","Choon Ki Ahn"],"pdf_url":"https://arxiv.org/pdf/2501.05931v1.pdf","comment":"16 pages, 9 figures; This article has been accepted for publication\n in a future issue of IEEE/CAA Journal of Automatica Sinica, but has not been\n fully edited. Content may change prior to final publication"},{"id":"http://arxiv.org/abs/2501.03968v2","updated":"2025-01-10T10:38:49Z","published":"2025-01-07T18:06:27Z","title":"VLM-driven Behavior Tree for Context-aware Task Planning","summary":" The use of Large Language Models (LLMs) for generating Behavior Trees (BTs)\nhas recently gained attention in the robotics community, yet remains in its\nearly stages of development. In this paper, we propose a novel framework that\nleverages Vision-Language Models (VLMs) to interactively generate and edit BTs\nthat address visual conditions, enabling context-aware robot operations in\nvisually complex environments. A key feature of our approach lies in the\nconditional control through self-prompted visual conditions. Specifically, the\nVLM generates BTs with visual condition nodes, where conditions are expressed\nas free-form text. Another VLM process integrates the text into its prompt and\nevaluates the conditions against real-world images during robot execution. 
We\nvalidated our framework in a real-world cafe scenario, demonstrating both its\nfeasibility and limitations.\n","authors":["Naoki Wake","Atsushi Kanehira","Jun Takamatsu","Kazuhiro Sasabuchi","Katsushi Ikeuchi"],"pdf_url":"https://arxiv.org/pdf/2501.03968v2.pdf","comment":"10 pages, 11 figures, 5 tables. Last updated on January 9th, 2024"},{"id":"http://arxiv.org/abs/2501.05770v1","updated":"2025-01-10T07:58:52Z","published":"2025-01-10T07:58:52Z","title":"Path Planning for Multi-Copter UAV Formation Employing a Generalized\n Particle Swarm Optimization","summary":" The paper investigates the problem of path planning techniques for\nmulti-copter uncrewed aerial vehicles (UAV) cooperation in a formation shape to\nexamine surrounding surfaces. We first describe the problem as a joint\nobjective cost for planning a path of the formation centroid working in a\ncomplicated space. The path planning algorithm, named the generalized particle\nswarm optimization algorithm, is then presented to construct an optimal,\nflyable path while avoiding obstacles and ensuring the flying mission\nrequirements. A path-development scheme is then incorporated to generate a\nrelevant path for each drone to maintain its position in the formation\nconfiguration. Simulation, comparison, and experiments have been conducted to\nverify the proposed approach. Results show the feasibility of the proposed\npath-planning algorithm with GEPSO.\n","authors":["Van Truong Hoang"],"pdf_url":"https://arxiv.org/pdf/2501.05770v1.pdf","comment":"6 pages, 8 figures, conference"},{"id":"http://arxiv.org/abs/2501.05750v1","updated":"2025-01-10T06:58:14Z","published":"2025-01-10T06:58:14Z","title":"Semantic Mapping in Indoor Embodied AI -- A Comprehensive Survey and\n Future Directions","summary":" Intelligent embodied agents (e.g. robots) need to perform complex semantic\ntasks in unfamiliar environments. 
Among many skills that the agents need to\npossess, building and maintaining a semantic map of the environment is most\ncrucial in long-horizon tasks. A semantic map captures information about the\nenvironment in a structured way, allowing the agent to reference it for\nadvanced reasoning throughout the task. While existing surveys in embodied AI\nfocus on general advancements or specific tasks like navigation and\nmanipulation, this paper provides a comprehensive review of semantic\nmap-building approaches in embodied AI, specifically for indoor navigation. We\ncategorize these approaches based on their structural representation (spatial\ngrids, topological graphs, dense point-clouds or hybrid maps) and the type of\ninformation they encode (implicit features or explicit environmental data). We\nalso explore the strengths and limitations of the map building techniques,\nhighlight current challenges, and propose future research directions. We\nidentify that the field is moving towards developing open-vocabulary,\nqueryable, task-agnostic map representations, while high memory demands and\ncomputational inefficiency still remaining to be open challenges. This survey\naims to guide current and future researchers in advancing semantic mapping\ntechniques for embodied AI systems.\n","authors":["Sonia Raychaudhuri","Angel X. Chang"],"pdf_url":"https://arxiv.org/pdf/2501.05750v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05723v1","updated":"2025-01-10T05:43:34Z","published":"2025-01-10T05:43:34Z","title":"Robot Error Awareness Through Human Reactions: Implementation,\n Evaluation, and Recommendations","summary":" Effective error detection is crucial to prevent task disruption and maintain\nuser trust. Traditional methods often rely on task-specific models or user\nreporting, which can be inflexible or slow. Recent research suggests social\nsignals, naturally exhibited by users in response to robot errors, can enable\nmore flexible, timely error detection. 
However, most studies rely on post hoc\nanalysis, leaving their real-time effectiveness uncertain and lacking\nuser-centric evaluation. In this work, we developed a proactive error detection\nsystem that combines user behavioral signals (facial action units and speech),\nuser feedback, and error context for automatic error detection. In a study (N =\n28), we compared our proactive system to a status quo reactive approach.\nResults show our system 1) reliably and flexibly detects error, 2) detects\nerrors faster than the reactive approach, and 3) is perceived more favorably by\nusers than the reactive one. We discuss recommendations for enabling robot\nerror awareness in future HRI systems.\n","authors":["Maia Stiber","Russell Taylor","Chien-Ming Huang"],"pdf_url":"https://arxiv.org/pdf/2501.05723v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05688v1","updated":"2025-01-10T03:41:03Z","published":"2025-01-10T03:41:03Z","title":"eKalibr: Dynamic Intrinsic Calibration for Event Cameras From First\n Principles of Events","summary":" The bio-inspired event camera has garnered extensive research attention in\nrecent years, owing to its significant potential derived from its high dynamic\nrange and low latency characteristics. Similar to the standard camera, the\nevent camera requires precise intrinsic calibration to facilitate further\nhigh-level visual applications, such as pose estimation and mapping. While\nseveral calibration methods for event cameras have been proposed, most of them\nare either (i) engineering-driven, heavily relying on conventional image-based\ncalibration pipelines, or (ii) inconvenient, requiring complex instrumentation.\nTo this end, we propose an accurate and convenient intrinsic calibration method\nfor event cameras, named eKalibr, which builds upon a carefully designed\nevent-based circle grid pattern recognition algorithm. 
To extract target\npatterns from events, we perform event-based normal flow estimation to identify\npotential events generated by circle edges, and cluster them spatially.\nSubsequently, event clusters associated with the same grid circles are matched\nand grouped using normal flows, for subsequent time-varying ellipse estimation.\nFitted ellipse centers are time-synchronized, for final grid pattern\nrecognition. We conducted extensive experiments to evaluate the performance of\neKalibr in terms of pattern extraction and intrinsic calibration. The\nimplementation of eKalibr is open-sourced at\n(https://github.com/Unsigned-Long/eKalibr) to benefit the research community.\n","authors":["Shuolong Chen","Xingxing Li","Liu Yuan","Ziao Liu"],"pdf_url":"https://arxiv.org/pdf/2501.05688v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05639v1","updated":"2025-01-10T00:56:39Z","published":"2025-01-10T00:56:39Z","title":"Scaling Safe Multi-Agent Control for Signal Temporal Logic\n Specifications","summary":" Existing methods for safe multi-agent control using logic specifications like\nSignal Temporal Logic (STL) often face scalability issues. This is because they\nrely either on single-agent perspectives or on Mixed Integer Linear Programming\n(MILP)-based planners, which are complex to optimize. These methods have proven\nto be computationally expensive and inefficient when dealing with a large\nnumber of agents. To address these limitations, we present a new scalable\napproach to multi-agent control in this setting. Our method treats the\nrelationships between agents using a graph structure rather than in terms of a\nsingle-agent perspective. 
Moreover, it combines a multi-agent collision\navoidance controller with a Graph Neural Network (GNN) based planner, models\nthe system in a decentralized fashion, and trains on STL-based objectives to\ngenerate safe and efficient plans for multiple agents, thereby optimizing the\nsatisfaction of complex temporal specifications while also facilitating\nmulti-agent collision avoidance. Our experiments show that our approach\nsignificantly outperforms existing methods that use a state-of-the-art\nMILP-based planner in terms of scalability and performance. The project website\nis https://jeappen.com/mastl-gcbf-website/ and the code is at\nhttps://github.com/jeappen/mastl-gcbf .\n","authors":["Joe Eappen","Zikang Xiong","Dipam Patel","Aniket Bera","Suresh Jagannathan"],"pdf_url":"https://arxiv.org/pdf/2501.05639v1.pdf","comment":"Accepted to CoRL 2024. arXiv admin note: text overlap with\n arXiv:2401.14554 by other authors"},{"id":"http://arxiv.org/abs/2501.05628v1","updated":"2025-01-10T00:08:37Z","published":"2025-01-10T00:08:37Z","title":"Concerns and Values in Human-Robot Interactions: A Focus on Social\n Robotics","summary":" Robots, as AI with physical instantiation, inhabit our social and physical\nworld, where their actions have both social and physical consequences, posing\nchallenges for researchers when designing social robots. This study starts with\na scoping review to identify discussions and potential concerns arising from\ninteractions with robotic systems. Two focus groups of technology ethics\nexperts then validated a comprehensive list of key topics and values in\nhuman-robot interaction (HRI) literature. These insights were integrated into\nthe HRI Value Compass web tool, to help HRI researchers identify ethical values\nin robot design. The tool was evaluated in a pilot study. 
This work benefits\nthe HRI community by highlighting key concerns in human-robot interactions and\nproviding an instrument to help researchers design robots that align with human\nvalues, ensuring future robotic systems adhere to these values in social\napplications.\n","authors":["Giulio Antonio Abbo","Tony Belpaeme","Micol Spitale"],"pdf_url":"https://arxiv.org/pdf/2501.05628v1.pdf","comment":"52 pages, 10 figures, 5 appendices"},{"id":"http://arxiv.org/abs/2501.06348v1","updated":"2025-01-10T21:20:11Z","published":"2025-01-10T21:20:11Z","title":"Why Automate This? Exploring the Connection between Time Use, Well-being\n and Robot Automation Across Social Groups","summary":" Understanding the motivations underlying the human inclination to automate\ntasks is vital to developing truly helpful robots integrated into daily life.\nAccordingly, we ask: are individuals more inclined to automate chores based on\nthe time they consume or the feelings experienced while performing them? This\nstudy explores these preferences and whether they vary across different social\ngroups (i.e., gender category and income level). Leveraging data from the\nBEHAVIOR-1K dataset, the American Time-Use Survey, and the American Time-Use\nSurvey Well-Being Module, we investigate the relationship between the desire\nfor automation, time spent on daily activities, and their associated feelings -\nHappiness, Meaningfulness, Sadness, Painfulness, Stressfulness, or Tiredness.\nOur key findings show that, despite common assumptions, time spent does not\nstrongly relate to the desire for automation for the general population. For\nthe feelings analyzed, only happiness and pain are key indicators. 
Significant\ndifferences by gender and economic level also emerged: Women prefer to automate\nstressful activities, whereas men prefer to automate those that make them\nunhappy; mid-income individuals prioritize automating less enjoyable and\nmeaningful activities, while low and high-income show no significant\ncorrelations. We hope our research helps motivate technologies to develop\nrobots that match the priorities of potential users, moving domestic robotics\ntoward more socially relevant solutions. We open-source all the data, including\nan online tool that enables the community to replicate our analysis and explore\nadditional trends at https://hri1260.github.io/why-automate-this.\n","authors":["Ruchira Ray","Leona Pang","Sanjana Srivastava","Li Fei-Fei","Samantha Shorey","Roberto Martín-Martín"],"pdf_url":"https://arxiv.org/pdf/2501.06348v1.pdf","comment":"20 pages, 14 figures"},{"id":"http://arxiv.org/abs/2501.07597v1","updated":"2025-01-10T02:20:59Z","published":"2025-01-10T02:20:59Z","title":"Learning-based Detection of GPS Spoofing Attack for Quadrotors","summary":" Safety-critical cyber-physical systems (CPS), such as quadrotor UAVs, are\nparticularly prone to cyber attacks, which can result in significant\nconsequences if not detected promptly and accurately. During outdoor\noperations, the nonlinear dynamics of UAV systems, combined with non-Gaussian\nnoise, pose challenges to the effectiveness of conventional statistical and\nmachine learning methods. To overcome these limitations, we present QUADFormer,\nan advanced attack detection framework for quadrotor UAVs leveraging a\ntransformer-based architecture. This framework features a residue generator\nthat produces sequences sensitive to anomalies, which are then analyzed by the\ntransformer to capture statistical patterns for detection and classification.\nFurthermore, an alert mechanism ensures UAVs can operate safely even when under\nattack. 
Extensive simulations and experimental evaluations highlight that\nQUADFormer outperforms existing state-of-the-art techniques in detection\naccuracy.\n","authors":["Pengyu Wang","Zhaohua Yang","Jialu Li","Ling Shi"],"pdf_url":"https://arxiv.org/pdf/2501.07597v1.pdf","comment":"Accepted in IEEE Industrial Electronics Society Annual Online\n Conference"},{"id":"http://arxiv.org/abs/2411.07261v2","updated":"2025-01-10T15:21:58Z","published":"2024-11-08T14:34:09Z","title":"Sinkage Study in Granular Material for Space Exploration Legged Robot\n Gripper","summary":" Wheeled rovers have been the primary choice for lunar exploration due to\ntheir speed and efficiency. However, deeper areas, such as lunar caves and\ncraters, require the mobility of legged robots. To do so, appropriate end\neffectors must be designed to enable climbing and walking on the granular\nsurface of the Moon. This paper investigates the behavior of an underactuated\nsoft gripper on deformable granular material when a legged robot is walking in\nsoft soil. A modular test bench and a simulation model were developed to\nobserve the gripper sinkage behavior under load. The gripper uses tendon-driven\nfingers to match its target shape and grasp on the target surface using\nmultiple micro-spines. The sinkage of the gripper in silica sand was measured\nby comparing the axial displacement of the gripper with the nominal load of the\nrobot mass. Multiple experiments were performed to observe the sinkage of the\ngripper over a range of slope angles. A simulation model accounting for the\ndegrees of compliance of the gripper fingers was created using Altair\nMotionSolve software and coupled to Altair EDEM to compute the gripper\ninteraction with particles utilizing the discrete element method. After\nvalidation of the model, complementary simulations using Lunar gravity and a\nregolith particle model were performed. 
The results show that a satisfactory\ngripper model with accurate freedom of motion can be created in simulation\nusing the Altair simulation packages and expected sinkage under load in a\nparticle-filled environment can be estimated using this model. By computing the\nsinkage of the end effector of legged robots, the results can be directly\nintegrated into the motion control algorithm and improve the accuracy of\nmobility in a granular material environment.\n","authors":["Arthur Candalot","James Hurrell","Malik Manel Hashim","Brigid Hickey","Mickael Laine","Kazuya Yoshida"],"pdf_url":"https://arxiv.org/pdf/2411.07261v2.pdf","comment":"Proceedings of the 21st International and 12th Asia-Pacific Regional\n Conference of the ISTVS"}],"Systems and Control":[{"id":"http://arxiv.org/abs/2501.06181v1","updated":"2025-01-10T18:58:44Z","published":"2025-01-10T18:58:44Z","title":"Best Response Convergence for Zero-sum Stochastic Dynamic Games with\n Partial and Asymmetric Information","summary":" We analyze best response dynamics for finding a Nash equilibrium of an\ninfinite horizon zero-sum stochastic linear quadratic dynamic game (LQDG) with\npartial and asymmetric information. We derive explicit expressions for each\nplayer's best response within the class of pure linear dynamic output feedback\ncontrol strategies where the internal state dimension of each control strategy\nis an integer multiple of the system state dimension. With each best response,\nthe players form increasingly higher-order belief states, leading to\ninfinite-dimensional internal states. However, we observe in extensive\nnumerical experiments that the game's value converges after just a few\niterations, suggesting that strategies associated with increasingly\nhigher-order belief states eventually provide no benefit. 
To help explain this\nconvergence, our numerical analysis reveals rapid decay of the controllability\nand observability Gramian eigenvalues and Hankel singular values in\nhigher-order belief dynamics, indicating that the higher-order belief dynamics\nbecome increasingly difficult for both players to control and observe.\nConsequently, the higher-order belief dynamics can be closely approximated by\nlow-order belief dynamics with bounded error, and thus feedback strategies with\nlimited internal state dimension can closely approximate a Nash equilibrium.\n","authors":["Yuxiang Guan","Iman Shames","Tyler H. Summers"],"pdf_url":"https://arxiv.org/pdf/2501.06181v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06167v1","updated":"2025-01-10T18:46:28Z","published":"2025-01-10T18:46:28Z","title":"Meta-Learning for Physically-Constrained Neural System Identification","summary":" We present a gradient-based meta-learning framework for rapid adaptation of\nneural state-space models (NSSMs) for black-box system identification. When\napplicable, we also incorporate domain-specific physical constraints to improve\nthe accuracy of the NSSM. The major benefit of our approach is that instead of\nrelying solely on data from a single target system, our framework utilizes data\nfrom a diverse set of source systems, enabling learning from limited target\ndata, as well as with few online training iterations. Through benchmark\nexamples, we demonstrate the potential of our approach, study the effect of\nfine-tuning subnetworks rather than full fine-tuning, and report real-world\ncase studies to illustrate the practical application and generalizability of\nthe approach to practical problems with physical-constraints. Specifically, we\nshow that the meta-learned models result in improved downstream performance in\nmodel-based state estimation in indoor localization and energy systems.\n","authors":["Ankush Chakrabarty","Gordon Wichern","Vedang M. Deshpande","Abraham P. 
Vinod","Karl Berntorp","Christopher R. Laughman"],"pdf_url":"https://arxiv.org/pdf/2501.06167v1.pdf","comment":"30 pages"},{"id":"http://arxiv.org/abs/2501.00588v2","updated":"2025-01-10T18:39:11Z","published":"2024-12-31T18:25:05Z","title":"Privacy-Preserving Distributed Defense Framework for DC Microgrids\n Against Exponentially Unbounded False Data Injection Attacks","summary":" This paper introduces a novel, fully distributed control framework for DC\nmicrogrids, enhancing resilience against exponentially unbounded false data\ninjection (EU-FDI) attacks. Our framework features a consensus-based secondary\ncontrol for each converter, effectively addressing these advanced threats. To\nfurther safeguard sensitive operational data, a privacy-preserving mechanism is\nincorporated into the control design, ensuring that critical information\nremains secure even under adversarial conditions. Rigorous Lyapunov stability\nanalysis confirms the framework's ability to maintain critical DC microgrid\noperations like voltage regulation and load sharing under EU-FDI threats. The\nframework's practicality is validated through hardware-in-the-loop experiments,\ndemonstrating its enhanced resilience and robust privacy protection against the\ncomplex challenges posed by quick variant FDI attacks.\n","authors":["Yi Zhang","Mohamadamin Rajabinezhad","Yichao Wang","Junbo Zhao","Shan Zuo"],"pdf_url":"https://arxiv.org/pdf/2501.00588v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06118v1","updated":"2025-01-10T17:15:59Z","published":"2025-01-10T17:15:59Z","title":"Nonlinear port-Hamiltonian system identification from input-state-output\n data","summary":" A framework for identifying nonlinear port-Hamiltonian systems using\ninput-state-output data is introduced. The framework utilizes neural networks'\nuniversal approximation capacity to effectively represent complex dynamics in a\nstructured way. 
We show that using the structure helps to make long-term\npredictions compared to baselines that do not incorporate physics. We also\nexplore different architectures based on MLPs, KANs, and using prior\ninformation. The technique is validated through examples featuring\nnonlinearities in either the skew-symmetric terms, the dissipative terms, or\nthe Hamiltonian.\n","authors":["Karim Cherifi","Achraf El Messaoudi","Hannes Gernandt","Marco Roschkowski"],"pdf_url":"https://arxiv.org/pdf/2501.06118v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06115v1","updated":"2025-01-10T17:12:30Z","published":"2025-01-10T17:12:30Z","title":"Development of an Advisory System for Parking of a Car and Trailer","summary":" Trailer parking is a challenging task due to the unstable nature of the\nvehicle-trailer system in reverse motion and the unintuitive steering actions\nrequired at the vehicle to accomplish the parking maneuver. This paper presents\na strategy to tackle this kind of maneuver with an advisory graphic aid to help\nthe human driver with the task of manually backing up the vehicle-trailer\nsystem. A kinematic vehicle-trailer model is derived to describe the low-speed\nmotion of the vehicle-trailer system, and its inverse kinematics is established\nby generating an equivalent virtual trailer axle steering command. The advisory\nsystem graphics is generated based on the inverse kinematics and displays the\nexpected trailer orientation given the current vehicle steer angle and\nconfiguration (hitch angle). 
Simulation study and animation are set up to test\nthe efficacy of the approach, where the user can select both vehicle speed and\nvehicle steering angle freely, which allows the user to stop the\nvehicle-trailer system and experiment with different steering inputs to see\ntheir effect on the predicted trailer motion before proceeding with the best\none according to the advisory graphics, hence creating a series of piecewise\ncontinuous control actions similar to how manual trailer reverse parking is\nusually carried out. The advisory graphics proves to provide the driver with an\nintuitive understanding of the trailer motion at any given configuration (hitch\nangle).\n","authors":["Xincheng Cao","Haochong Chen","Bilin Aksun Guvenc","Levent Guvenc","Shihong Fan","John Harber","Brian Link","Peter Richmond","Dokyung Yim"],"pdf_url":"https://arxiv.org/pdf/2501.06115v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06113v1","updated":"2025-01-10T17:05:59Z","published":"2025-01-10T17:05:59Z","title":"Vehicle-in-Virtual-Environment (VVE) Based Autonomous Driving Function\n Development and Evaluation Methodology for Vulnerable Road User Safety","summary":" Traditional methods for developing and evaluating autonomous driving\nfunctions, such as model-in-the-loop (MIL) and hardware-in-the-loop (HIL)\nsimulations, heavily depend on the accuracy of simulated vehicle models and\nhuman factors, especially for vulnerable road user safety systems. Continuation\nof development during public road deployment forces other road users including\nvulnerable ones to involuntarily participate in the development process,\nleading to safety risks, inefficiencies, and a decline in public trust. 
To\naddress these deficiencies, the Vehicle-in-Virtual-Environment (VVE) method was\nproposed as a safer, more efficient, and cost-effective solution for developing\nand testing connected and autonomous driving technologies by operating the real\nvehicle and multiple other actors like vulnerable road users in different test\nareas while being immersed within the same highly realistic virtual\nenvironment. This VVE approach synchronizes real-world vehicle and vulnerable\nroad user motion within the same virtual scenario, enabling the safe and\nrealistic testing of various traffic situations in a safe and repeatable\nmanner. In this paper, we propose a new testing pipeline that sequentially\nintegrates MIL, HIL, and VVE methods to comprehensively develop and evaluate\nautonomous driving functions. The effectiveness of this testing pipeline will\nbe demonstrated using an autonomous driving path-tracking algorithm with local\ndeep reinforcement learning modification for vulnerable road user collision\navoidance.\n","authors":["Haochong Chen","Xincheng Cao","Levent Guvenc","Bilin Aksun Guvenc"],"pdf_url":"https://arxiv.org/pdf/2501.06113v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06112v1","updated":"2025-01-10T17:05:32Z","published":"2025-01-10T17:05:32Z","title":"Optimizing Experiments for Accurate Battery Circuit Parameters\n Estimation: Reduction and Adjustment of Frequency Set Used in Electrochemical\n Impedance Spectroscopy","summary":" In this paper, we study a suitable experimental design of electrochemical\nimpedance spectroscopy (EIS) to reduce the number of frequency points while not\nsignificantly affecting the uncertainties of the estimated cell's equivalent\ncircuit model (ECM) parameters. It is based on an E-optimal experimental design\nthat aims to maximize the information about the ECM parameters collected by EIS\nmeasurements and, at the same time, minimize the overall uncertainty. 
In a\nnumerical experiment, we first analyze to which extent reducing the number of\nmeasurement points at low frequencies affects the uncertainty of the estimated\nparameters. Secondly, we show that applying the frequency adjustments can lead\nto the same or even improved global uncertainty of ECM parameter estimates as\nwith a higher number of measurements. This is numerically verified through a\ncase study using the ECM parameters of a commercial battery cell.\n","authors":["Vladimir Sovljanski","Mario Paolone","Sylvain Tant","Damien Pierre Sainflou"],"pdf_url":"https://arxiv.org/pdf/2501.06112v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06104v1","updated":"2025-01-10T16:57:38Z","published":"2025-01-10T16:57:38Z","title":"Weather-Driven Priority Charging for Battery Storage Systems in Hybrid\n Renewable Energy Grid","summary":" The integration of renewable energy into the power grid is often hindered by\nits fragmented infrastructure, leading to inefficient utilization due to the\nvariability of energy production and its reliance on weather conditions.\nBattery storage systems, while essential for stabilizing energy supply, face\nchallenges like sub-optimal energy distribution, accelerating battery\ndegradation, and reducing operational efficiency. This paper presents a novel\nsolution to these challenges by developing a large-scale, interconnected\nrenewable energy network that optimizes energy storage and distribution. The\nproposed system includes strategically placed battery storage facilities that\nstabilize energy production by compensating for fluctuations in renewable\noutput. A priority charging algorithm, informed by real-time weather\nforecasting and load monitoring, ensures that the most suitable battery systems\nare charged under varying conditions. 
Within each storage facility, a secondary\npriority charging algorithm minimizes battery degradation by ranking batteries\nbased on critical parameters such as state of health (SoH) and state of charge\n(SoC) and deciding which to charge. This comprehensive approach enhances the\nefficiency and longevity of battery storage systems, offering a more reliable\nand resilient renewable energy infrastructure.\n","authors":["Dhrumil Bhatt","Siddharth Penumatsa","Nirbhay Singhal"],"pdf_url":"https://arxiv.org/pdf/2501.06104v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06092v1","updated":"2025-01-10T16:45:00Z","published":"2025-01-10T16:45:00Z","title":"Molecular Communication-Inspired Particle Collector-Transmitter (PaCoT)\n for Heavy Metal Removal from Human Circulatory System","summary":" This study proposes a novel molecular communication (MC)-inspired\nnanomachine, PArticle COllector-Transmitter (PaCoT), to remove toxic heavy\nmetals from the human circulatory system. PaCoT collects these toxic metals and\ntransmits them to release nodes, such as lymph capillaries, before they reach\ncritical organs. The design incorporates key physical parameters and operates\nthrough particle reception and release mechanisms. In the reception process,\ndescribed as ligand-receptor binding reactions, modeled as a continuous-time\nMarkov process (CTMP), PaCoT uses metallothionein proteins as receptors and\nheavy metals (e.g., Zn, Pb, Cd) as ligands. We assume that the toxicity\ncondition (toxic (bit-1), non-toxic (bit-0)) is encoded into the concentration\nof heavy metal molecules. Thus, we consider that heavy metal concentration\nwithin the MC channel (e.g., human circulatory system) employs binary\nconcentration shift keying (binary CSK). The concentration ratio of specific\nheavy metals is estimated to infer toxicity, i.e., a high ratio indicates\ntoxicity and a low ratio suggests non-toxicity. 
Toxicity detection is achieved\nby monitoring the receptor bound duration in the presence of interferers and\nvarious types of heavy metals. After detecting and collecting toxic heavy\nmetals, PaCoT securely retains them in a liquid medium (e.g., water) until\nrelease, employing two mechanisms: (1) a single-disc viscous micropump to\nregulate flow rate, and (2) Brownian motion to facilitate diffusion. PaCoT's\nperformance is evaluated through MATLAB simulations, focusing on bit error\nprobability (BEP) of the toxicity detection method, release time of molecules\nfrom PaCoT and energy consumption.\n","authors":["Hilal Esra Yaldiz","Ozgur B. Akan"],"pdf_url":"https://arxiv.org/pdf/2501.06092v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06089v1","updated":"2025-01-10T16:39:01Z","published":"2025-01-10T16:39:01Z","title":"Towards Developing Socially Compliant Automated Vehicles: State of the\n Art, Experts Expectations, and A Conceptual Framework","summary":" Automated Vehicles (AVs) hold promise for revolutionizing transportation by\nimproving road safety, traffic efficiency, and overall mobility. Despite the\nsteady advancement in high-level AVs in recent years, the transition to full\nautomation entails a period of mixed traffic, where AVs of varying automation\nlevels coexist with human-driven vehicles (HDVs). Making AVs socially compliant\nand understood by human drivers is expected to improve the safety and\nefficiency of mixed traffic. Thus, ensuring AVs compatibility with HDVs and\nsocial acceptance is crucial for their successful and seamless integration into\nmixed traffic. However, research in this critical area of developing Socially\nCompliant AVs (SCAVs) remains sparse. This study carries out the first\ncomprehensive scoping review to assess the current state of the art in\ndeveloping SCAVs, identifying key concepts, methodological approaches, and\nresearch gaps. 
An expert interview was also conducted to identify critical\nresearch gaps and expectations towards SCAVs. Based on the scoping review and\nexpert interview input, a conceptual framework is proposed for the development\nof SCAVs. The conceptual framework is evaluated using an online survey\ntargeting researchers, technicians, policymakers, and other relevant\nprofessionals worldwide. The survey results provide valuable validation and\ninsights, affirming the significance of the proposed conceptual framework in\ntackling the challenges of integrating AVs into mixed-traffic environments.\nAdditionally, future research perspectives and suggestions are discussed,\ncontributing to the research and development agenda of SCAVs.\n","authors":["Yongqi Dong","Bart van Arem","Haneen Farah"],"pdf_url":"https://arxiv.org/pdf/2501.06089v1.pdf","comment":"39 pages, 13 figures, under review by the journal of Transportation\n Research Part E: Logistics and Transportation Review"},{"id":"http://arxiv.org/abs/2501.06042v1","updated":"2025-01-10T15:21:48Z","published":"2025-01-10T15:21:48Z","title":"The improvement in transmission resilience metrics from reduced outages\n or faster restoration can be calculated by rerunning historical outage data","summary":" Transmission utilities routinely collect detailed outage data, including\nresilience events in which outages bunch up due to weather. The resilience\nevents and their resilience metrics can readily be extracted from this\nhistorical outage data. Improvements such as grid hardening or investments in\nrestoration lead to reduced outages or faster restoration. We show how to rerun\nthis history with the effects of the reduced outages or faster restoration\nincluded to find the resulting improvement in resilience metrics, thus\nquantifying the benefits of these investments. This is demonstrated with case\nstudies for specific events (a derecho and a hurricane), and all large events\nor large thunderstorms in the Midwest USA. 
Instead of predicting future extreme\nevents with models, which is very challenging, the historical rerun readily\nquantifies the benefits that a resilience investment would have had if it had\nbeen made in the past. The historical rerun is particularly vivid in making the\ncase for resilience investments to stakeholders because it quantifies the\nbenefits for events actually experienced by those stakeholders, rather than for\nfuture events predicted with uncertainty.\n","authors":["Arslan Ahmad","Ian Dobson","Svetlana Ekisheva","Christopher Claypool"],"pdf_url":"https://arxiv.org/pdf/2501.06042v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06030v1","updated":"2025-01-10T15:07:07Z","published":"2025-01-10T15:07:07Z","title":"Resiliency metrics quantifying emergency response in a distribution\n system","summary":" The electric distribution system is a cornerstone of modern life, playing a\ncritical role in the daily activities and well-being of individuals. As the\nworld transitions toward a decarbonized future, where even mobility relies on\nelectricity, ensuring the resilience of the grid becomes paramount. This paper\nintroduces novel resilience metrics designed to equip utilities and\nstakeholders with actionable tools to assess performance during storm events.\nThe metrics focus on emergency storm response and the resources required to\nimprove customer service. The practical calculation of the metrics from\nhistorical utility data is demonstrated for multiple storm events.\nAdditionally, the metrics' improvement with added crews is estimated by\n\"rerunning history\" with faster restoration. 
By applying this resilience\nframework, utilities can enhance their restoration strategies and unlock\npotential cost savings, benefiting both providers and customers in an era of\nheightened energy dependency.\n","authors":["Shikhar Pandey","Gowtham Kandaperumal","Arslan Ahmad","Ian Dobson"],"pdf_url":"https://arxiv.org/pdf/2501.06030v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06016v1","updated":"2025-01-10T14:53:21Z","published":"2025-01-10T14:53:21Z","title":"Investigating the Impact of Observation Space Design Choices On Training\n Reinforcement Learning Solutions for Spacecraft Problems","summary":" Recent research using Reinforcement Learning (RL) to learn autonomous control\nfor spacecraft operations has shown great success. However, a recent study\nshowed their performance could be improved by changing the action space, i.e.\ncontrol outputs, used in the learning environment. This has opened the door for\nfinding more improvements through further changes to the environment. The work\nin this paper focuses on how changes to the environment's observation space can\nimpact the training and performance of RL agents learning the spacecraft\ninspection task. The studies are split into two groups. The first looks at the\nimpact of sensors that were designed to help agents learn the task. The second\nlooks at the impact of reference frames, reorienting the agent to see the world\nfrom a different perspective. 
The results show the sensors are not necessary,\nbut most of them help agents learn more optimal behavior, and that the\nreference frame does not have a large impact, but is best kept consistent.\n","authors":["Nathaniel Hamilton","Kyle Dunlap","Kerianne L Hobbs"],"pdf_url":"https://arxiv.org/pdf/2501.06016v1.pdf","comment":"18 pages, 10 figures, 3 tables"},{"id":"http://arxiv.org/abs/2501.05994v1","updated":"2025-01-10T14:26:23Z","published":"2025-01-10T14:26:23Z","title":"On the Interaction in Transient Stability of Two-Inverter Power Systems\n containing GFL inverter Using Manifold Method","summary":" Many renewable energy resources are integrated into power systems via\ngrid-following (GFL) inverters which rely on a phase-locked loop (PLL) for grid\nsynchronization. During severe grid faults, GFL inverters are vulnerable to\ntransient instability, often leading to disconnection from the grid. This paper\naims to elucidate the interaction mechanisms and define the stability\nboundaries of systems of two inverters, including GFL, grid-forming (GFM), or\ngrid-supporting (GSP) inverters. First, the generalized large-signal expression\nfor the two-inverter system under various inverter combinations is derived,\nrevealing that no energy function exists for systems containing GFL inverters.\nThis implies that the traditional direct method cannot be applied to such\nsystems. To overcome these challenges, a manifold method is employed to\nprecisely determine the domain of attraction (DOA) of the system, and the\ntransient stability margin is assessed by a new metric termed the critical\nclearing radius (CCR). A case study of the two-inverter system under various\ninverter combinations is conducted to explore large-signal interactions across\ndifferent scenarios. 
Manifold analysis and simulation results reveal that GSP\ninverters using PLL for grid synchronization exhibit behavior similar to GFM\ninverters when the droop coefficients in the terminal voltage control loop\n(TVC) are sufficiently large. Compared to GFL inverters, GSP inverters\nincorporating a TVC significantly enhances the transient stability of other\ninverters. In the STATCOM case, the optimal placement of the STATCOM, realized\nby GSP or GFM inverters, is identified to be at the midpoint of a transmission\nline. All findings in this paper are validated through electromagnetic\ntransient (EMT) simulations\n","authors":["Yifan Zhang","Yunjie Gu","Yue Zhu","Timothy C. Green","Hsiao-Dong Chiang"],"pdf_url":"https://arxiv.org/pdf/2501.05994v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05984v1","updated":"2025-01-10T14:14:18Z","published":"2025-01-10T14:14:18Z","title":"The Safe Trusted Autonomy for Responsible Space Program","summary":" The Safe Trusted Autonomy for Responsible Space (STARS) program aims to\nadvance autonomy technologies for space by leveraging machine learning\ntechnologies while mitigating barriers to trust, such as uncertainty,\nopaqueness, brittleness, and inflexibility. This paper presents the\nachievements and lessons learned from the STARS program in integrating\nreinforcement learning-based multi-satellite control, run time assurance\napproaches, and flexible human-autonomy teaming interfaces, into a new\nintegrated testing environment for collaborative autonomous satellite systems.\nThe primary results describe analysis of the reinforcement learning\nmulti-satellite control and run time assurance algorithms. These algorithms are\nintegrated into a prototype human-autonomy interface using best practices from\nhuman-autonomy trust literature, however detailed analysis of the effectiveness\nis left to future work. References are provided with additional detailed\nresults of individual experiments.\n","authors":["Kerianne L. 
Hobbs","Sean Phillips","Michelle Simon","Joseph B. Lyons","Jared Culbertson","Hamilton Scott Clouse","Nathaniel Hamilton","Kyle Dunlap","Zachary S. Lippay","Joshua Aurand","Zachary I. Bell","Taleri Hammack","Dorothy Ayres","Rizza Lim"],"pdf_url":"https://arxiv.org/pdf/2501.05984v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05946v1","updated":"2025-01-10T13:18:00Z","published":"2025-01-10T13:18:00Z","title":"Coverage and Spectral Efficiency of NOMA-Enabled LEO Satellite Networks\n with Ordering Schemes","summary":" This paper investigates an analytical model for low-earth orbit (LEO)\nmulti-satellite downlink non-orthogonal multiple access (NOMA) networks. The\nsatellites transmit data to multiple NOMA user terminals (UTs), each employing\nsuccessive interference cancellation (SIC) for decoding. Two ordering schemes\nare adopted for NOMA-enabled LEO satellite networks, i.e., mean signal power\n(MSP)-based ordering and\ninstantaneous-signal-to-inter-satellite-interference-plus-noise ratio\n(ISINR)-based ordering. For each ordering scheme, we derive the coverage\nprobabilities of UTs under different channel conditions. Moreover, we discuss\nhow coverage is influenced by SIC, main-lobe gain, and tradeoffs between the\nnumber of satellites and their altitudes. Additionally, two user fairness-based\npower allocation (PA) schemes are considered, and PA coefficients with the\noptimal number of UTs that maximize their sum spectral efficiency (SE) are\nstudied. Simulation results show that there exists a maximum\nsignal-to-inter-satellite-interference-plus-noise ratio (SINR) threshold for\neach PA scheme that ensures the operation of NOMA in LEO satellite networks,\nand the benefit of NOMA only exists when the target SINR is below a certain\nthreshold. Compared with orthogonal multiple access (OMA), NOMA increases UTs'\nsum SE by as much as 35\\%. 
Furthermore, for most SINR thresholds, the sum SE\nincreases with the number of UTs to the highest value, whilst the maximum sum\nSE is obtained when there are two UTs.\n","authors":["Xiangyu Li","Bodong Shang","Qingqing Wu","Chao Ren"],"pdf_url":"https://arxiv.org/pdf/2501.05946v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05943v1","updated":"2025-01-10T13:08:38Z","published":"2025-01-10T13:08:38Z","title":"Koopman-Based Model Predictive Control of Functional Electrical\n Stimulation for Ankle Dorsiflexion and Plantarflexion Assistance","summary":" Functional Electrical Stimulation (FES) can be an effective tool to augment\nparetic muscle function and restore normal ankle function. Our approach\nincorporates a real-time, data-driven Model Predictive Control (MPC) scheme,\nbuilt upon a Koopman operator theory (KOT) framework. This framework adeptly\ncaptures the complex nonlinear dynamics of ankle motion in a linearized form,\nenabling application of linear control approaches for highly nonlinear\nFES-actuated dynamics. Utilizing inertial measurement units (IMUs), our method\naccurately predicts the FES-induced ankle movements, while accounting for\nnonlinear muscle actuation dynamics, including the muscle activation for both\nplantarflexors, and dorsiflexors (Tibialis Anterior (TA)). The linear\nprediction model derived through KOT allowed us to formulate the MPC problem\nwith linear state space dynamics, enhancing the real-time feasibility,\nprecision and adaptability of the FES driven control. The effectiveness and\napplicability of our approach have been demonstrated through comprehensive\nsimulations and experimental trials, including three participants with no\ndisability and a participant with Multiple Sclerosis. 
Our findings highlight\nthe potential of a KOT-based MPC approach for FES based gait assistance that\noffers effective and personalized assistance for individuals with gait\nimpairment conditions.\n","authors":["Mayank Singh","Noor Hakam","Trisha M. Kesar","Nitin Sharma"],"pdf_url":"https://arxiv.org/pdf/2501.05943v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.15862v2","updated":"2025-01-10T10:46:26Z","published":"2024-10-21T10:43:14Z","title":"Integration of Cobalt Ferromagnetic Control Gates for Electrical and\n Magnetic Manipulation of Semiconductor Quantum Dots","summary":" The rise of electron spin qubit architectures for quantum computing\nprocessors has led to a strong interest in designing and integrating\nferromagnets to induce stray magnetic fields for electron dipole spin resonance\n(EDSR). The integration of nanomagnets imposes however strict layout and\nprocessing constraints, challenging the arrangement of different gating layers\nand the control of neighboring qubit frequencies. This work reports a\nsuccessful integration of nano-sized cobalt control gates into a multi-gate\nFD-SOI nanowire with nanometer-scale dot-to-magnet pitch, simultaneously\nexploiting electrical and ferromagnetic properties of the gate stack at\nnanoscale. The electrical characterization of the multi-gate nanowire exhibits\nfull field effect functionality of all ferromagnetic gates from room\ntemperature to 10 mK, proving quantum dot formation when ferromagnets are\noperated as barrier gates. The front-end-of-line (FEOL) compatible integration\nof cobalt is examined by energy dispersive X-ray spectroscopy and high/low\nfrequency capacitance characterization, confirming the quality of interfaces\nand control over material diffusion. Insights into the magnetic properties of\nthin films and patterned control-gates are provided by vibrating sample\nmagnetometry and electron holography measurements. 
Micromagnetic simulations\nanticipate that this structure fulfills the requirements for EDSR driving for\nmagnetic fields higher than 1 T, where a homogeneous magnetization along the\nhard magnetic axis of the Co gates is expected. The FDSOI architecture\nshowcased in this study provides a scalable alternative to micromagnets\ndeposited in the back-end-of-line (BEOL) and middle-of-line (MOL) processes,\nwhile bringing technological insights for the FEOL-compatible integration of Co\nnanostructures in spin qubit devices.\n","authors":["Fabio Bersano","Michele Aldeghi","Niccolò Martinolli","Victor Boureau","Thibault Aboud","Michele Ghini","Pasquale Scarlino","Gian Salis","Adrian Mihai Ionescu"],"pdf_url":"https://arxiv.org/pdf/2410.15862v2.pdf","comment":"15 pages, 7 figures"},{"id":"http://arxiv.org/abs/2501.05842v1","updated":"2025-01-10T10:33:13Z","published":"2025-01-10T10:33:13Z","title":"Orthogonal projection-based regularization for efficient model\n augmentation","summary":" Deep-learning-based nonlinear system identification has shown the ability to\nproduce reliable and highly accurate models in practice. However, these\nblack-box models lack physical interpretability, and often a considerable part\nof the learning effort is spent on capturing already expected/known behavior\ndue to first-principles-based understanding of some aspects of the system. A\npotential solution is to integrate prior physical knowledge directly into the\nmodel structure, combining the strengths of physics-based modeling and\ndeep-learning-based identification. The most common approach is to use an\nadditive model augmentation structure, where the physics-based and the\nmachine-learning (ML) components are connected in parallel. However, such\nmodels are overparametrized, training them is challenging, potentially causing\nthe physics-based part to lose interpretability. 
To overcome this challenge,\nthis paper proposes an orthogonal projection-based regularization technique to\nenhance parameter learning, convergence, and even model accuracy in\nlearning-based augmentation of nonlinear baseline models.\n","authors":["Bendegúz M. Györök","Jan H. Hoekstra","Johan Kon","Tamás Péni","Maarten Schoukens","Roland Tóth"],"pdf_url":"https://arxiv.org/pdf/2501.05842v1.pdf","comment":"Submitted to L4DC 2025"},{"id":"http://arxiv.org/abs/2401.10726v4","updated":"2025-01-10T10:30:41Z","published":"2024-01-19T14:43:04Z","title":"Empowering Aggregators with Practical Data-Driven Tools: Harnessing\n Aggregated and Disaggregated Flexibility for Demand Response","summary":" This study explores the interaction between aggregators and building\noccupants in activating flexibility through Demand Response (DR) programs, with\na focus on reinforcing the resilience of the energy system considering the\nuncertainties presented by Renewable Energy Sources (RES). Firstly, it\nintroduces a methodology of optimizing aggregated flexibility provision\nstrategies in environments with limited data, utilizing Discrete Fourier\nTransformation (DFT) and clustering techniques to identify building occupants'\nactivity patterns. Secondly, the study assesses the disaggregated flexibility\nprovision of Heating Ventilation and Air Conditioning (HVAC) systems during DR\nevents, employing machine learning and optimization techniques for precise,\ndevice-level analysis. 
The first approach offers a non-intrusive pathway for\naggregators to provide flexibility services in environments of a single smart\nmeter for the whole building's consumption, while the second approach maximizes\nthe amount of flexibility in the case of dedicated metering devices to the HVAC\nsystems by carefully considering building occupants' thermal comfort profiles.\nThrough the application of data-driven techniques and encompassing case studies\nfrom both industrial and residential buildings, this paper not only unveils\npivotal opportunities for aggregators in the balancing and emerging flexibility\nmarkets but also successfully develops and demonstrates end-to-end practical\ntools for aggregators.\n","authors":["Costas Mylonas","Donata Boric","Leila Luttenberger Maric","Alexandros Tsitsanis","Eleftheria Petrianou","Magda Foti"],"pdf_url":"https://arxiv.org/pdf/2401.10726v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02203v2","updated":"2025-01-10T10:08:13Z","published":"2023-10-03T16:59:26Z","title":"Stochastic Quantum Power Flow for Risk Assessment in Power Systems","summary":" This paper introduces the first quantum computing framework for Stochastic\nQuantum Power Flow (SQPF) analysis in power systems. The proposed method\nleverages quantum states to encode power flow distributions, enabling the use\nof Quantum Monte Carlo (QMC) sampling to efficiently assess the probability of\nline overloads. Our approach significantly reduces the required sample size\ncompared to traditional Monte Carlo methods, making it particularly suited for\nrisk assessments in scenarios involving high uncertainty, such as renewable\nenergy integration. We validate the method on two test systems, demonstrating\nthe computational advantage of quantum algorithms in reducing sample complexity\nwhile maintaining accuracy. 
This work represents a foundational step toward\nscalable quantum power flow analysis, with potential applications in future\npower system operations and planning. The results show promising computational\nspeedups, underscoring the potential of quantum computing in addressing the\nincreasing uncertainty in modern power grids.\n","authors":["Brynjar Sævarsson","Hjörtur Jóhannsson","Spyros Chatzivasileiadis"],"pdf_url":"https://arxiv.org/pdf/2310.02203v2.pdf","comment":"Accepted by the Electric Power System Research journal"},{"id":"http://arxiv.org/abs/2501.05815v1","updated":"2025-01-10T09:38:42Z","published":"2025-01-10T09:38:42Z","title":"Enhanced sampled-data model predictive control via nonlinear lifting","summary":" This paper introduces a novel nonlinear model predictive control (NMPC)\nframework that incorporates a lifting technique to enhance control performance\nfor nonlinear systems. While the lifting technique has been widely employed in\nlinear systems to capture intersample behaviour, their application to nonlinear\nsystems remains unexplored. We address this gap by formulating an NMPC scheme\nthat combines fast-sample fast-hold (FSFH) approximations and numerical methods\nto approximate system dynamics and cost functions. The proposed approach is\nvalidated through two case studies: the Van der Pol oscillator and the inverted\npendulum on a cart. Simulation results demonstrate that the lifted NMPC\noutperforms conventional NMPC in terms of reduced settling time and improved\ncontrol accuracy. 
These findings underscore the potential of the lifting-based\nNMPC for efficient control of nonlinear systems, offering a practical solution\nfor real-time applications.\n","authors":["Nuthasith Gerdpratoom","Fumiya Matsuzaki","Yutaka Yamamoto","Kaoru Yamamoto"],"pdf_url":"https://arxiv.org/pdf/2501.05815v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05808v1","updated":"2025-01-10T09:15:40Z","published":"2025-01-10T09:15:40Z","title":"Real-Time Integrated Dispatching and Idle Fleet Steering with Deep\n Reinforcement Learning for A Meal Delivery Platform","summary":" To achieve high service quality and profitability, meal delivery platforms\nlike Uber Eats and Grubhub must strategically operate their fleets to ensure\ntimely deliveries for current orders while mitigating the consequential impacts\nof suboptimal decisions that leads to courier understaffing in the future. This\nstudy set out to solve the real-time order dispatching and idle courier\nsteering problems for a meal delivery platform by proposing a reinforcement\nlearning (RL)-based strategic dual-control framework. To address the inherent\nsequential nature of these problems, we model both order dispatching and\ncourier steering as Markov Decision Processes. Trained via a deep reinforcement\nlearning (DRL) framework, we obtain strategic policies by leveraging the\nexplicitly predicted demands as part of the inputs. In our dual-control\nframework, the dispatching and steering policies are iteratively trained in an\nintegrated manner. These forward-looking policies can be executed in real-time\nand provide decisions while jointly considering the impacts on local and\nnetwork levels. To enhance dispatching fairness, we propose convolutional deep\nQ networks to construct fair courier embeddings. To simultaneously rebalance\nthe supply and demand within the service network, we propose to utilize\nmean-field approximated supply-demand knowledge to reallocate idle couriers at\nthe local level. 
Utilizing the policies generated by the RL-based strategic\ndual-control framework, we find the delivery efficiency and fairness of\nworkload distribution among couriers have been improved, and under-supplied\nconditions have been alleviated within the service network. Our study sheds\nlight on designing an RL-based framework to enable forward-looking real-time\noperations for meal delivery platforms and other on-demand services.\n","authors":["Jingyi Cheng","Shadi Sharif Azadeh"],"pdf_url":"https://arxiv.org/pdf/2501.05808v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05778v1","updated":"2025-01-10T08:21:41Z","published":"2025-01-10T08:21:41Z","title":"Formally Verified Neural Lyapunov Function for Incremental\n Input-to-State Stability of Unknown Systems","summary":" This work presents an approach to synthesize a Lyapunov-like function to\nensure incrementally input-to-state stability ($\\delta$-ISS) property for an\nunknown discrete-time system. To deal with challenges posed by unknown system\ndynamics, we parameterize the Lyapunov-like function as a neural network, which\nwe train using the data samples collected from the unknown system along with\nappropriately designed loss functions. We propose a validity condition to test\nthe obtained function and incorporate it into the training framework to ensure\nprovable correctness at the end of the training. 
Finally, the usefulness of the\nproposed technique is proved using two case studies: a scalar non-linear\ndynamical system and a permanent magnet DC motor.\n","authors":["Ahan Basu","Bhabani Shankar Dey","Pushpak Jagtap"],"pdf_url":"https://arxiv.org/pdf/2501.05778v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05770v1","updated":"2025-01-10T07:58:52Z","published":"2025-01-10T07:58:52Z","title":"Path Planning for Multi-Copter UAV Formation Employing a Generalized\n Particle Swarm Optimization","summary":" The paper investigates the problem of path planning techniques for\nmulti-copter uncrewed aerial vehicles (UAV) cooperation in a formation shape to\nexamine surrounding surfaces. We first describe the problem as a joint\nobjective cost for planning a path of the formation centroid working in a\ncomplicated space. The path planning algorithm, named the generalized particle\nswarm optimization algorithm, is then presented to construct an optimal,\nflyable path while avoiding obstacles and ensuring the flying mission\nrequirements. A path-development scheme is then incorporated to generate a\nrelevant path for each drone to maintain its position in the formation\nconfiguration. Simulation, comparison, and experiments have been conducted to\nverify the proposed approach. Results show the feasibility of the proposed\npath-planning algorithm with GEPSO.\n","authors":["Van Truong Hoang"],"pdf_url":"https://arxiv.org/pdf/2501.05770v1.pdf","comment":"6 pages, 8 figures, conference"},{"id":"http://arxiv.org/abs/2404.14767v4","updated":"2025-01-10T06:45:33Z","published":"2024-04-23T06:10:31Z","title":"Remaining Discharge Energy Prediction for Lithium-Ion Batteries Over\n Broad Current Ranges: A Machine Learning Approach","summary":" Lithium-ion batteries have found their way into myriad sectors of industry to\ndrive electrification, decarbonization, and sustainability. 
A crucial aspect in\nensuring their safe and optimal performance is monitoring their energy levels.\nIn this paper, we present the first study on predicting the remaining energy of\na battery cell undergoing discharge over wide current ranges from low to high\nC-rates. The complexity of the challenge arises from the cell's\nC-rate-dependent energy availability as well as its intricate electro-thermal\ndynamics especially at high C-rates. To address this, we introduce a new\ndefinition of remaining discharge energy and then undertake a systematic effort\nin harnessing the power of machine learning to enable its prediction. Our\neffort includes two parts in cascade. First, we develop an accurate dynamic\nmodel based on integration of physics with machine learning to capture a\nbattery's voltage and temperature behaviors. Second, based on the model, we\npropose a machine learning approach to predict the remaining discharge energy\nunder arbitrary C-rates and pre-specified cut-off limits in voltage and\ntemperature. The experimental validation shows that the proposed approach can\npredict the remaining discharge energy with a relative error of less than 3%\nwhen the current varies between 0~8 C for an NCA cell and 0~15 C for an LFP\ncell. The approach, by design, is amenable to training and computation.\n","authors":["Hao Tu","Manashita Borah","Scott Moura","Yebin Wang","Huazhen Fang"],"pdf_url":"https://arxiv.org/pdf/2404.14767v4.pdf","comment":"15 pages, 13 figures, 4 tables"},{"id":"http://arxiv.org/abs/2406.00621v3","updated":"2025-01-10T06:10:19Z","published":"2024-06-02T05:50:41Z","title":"Log-Scale Quantization in Distributed First-Order Methods:\n Gradient-based Learning from Distributed Data","summary":" Decentralized strategies are of interest for learning from large-scale data\nover networks. This paper studies learning over a network of geographically\ndistributed nodes/agents subject to quantization. 
Each node possesses a private\nlocal cost function, collectively contributing to a global cost function, which\nthe considered methodology aims to minimize. In contrast to many existing\npapers, the information exchange among nodes is log-quantized to address\nlimited network-bandwidth in practical situations. We consider a first-order\ncomputationally efficient distributed optimization algorithm (with no extra\ninner consensus loop) that leverages node-level gradient correction based on\nlocal data and network-level gradient aggregation only over nearby nodes. This\nmethod only requires balanced networks with no need for stochastic weight\ndesign. It can handle log-scale quantized data exchange over possibly\ntime-varying and switching network setups. We study convergence over both\nstructured networks (for example, training over data-centers) and ad-hoc\nmulti-agent networks (for example, training over dynamic robotic networks).\nThrough experimental validation, we show that (i) structured networks generally\nresult in a smaller optimality gap, and (ii) log-scale quantization leads to a\nsmaller optimality gap compared to uniform quantization.\n","authors":["Mohammadreza Doostmohammadian","Muhammad I. Qureshi","Mohammad Hossein Khalesi","Hamid R. Rabiee","Usman A. Khan"],"pdf_url":"https://arxiv.org/pdf/2406.00621v3.pdf","comment":"IEEE TASE 2025"},{"id":"http://arxiv.org/abs/2501.05715v1","updated":"2025-01-10T05:22:55Z","published":"2025-01-10T05:22:55Z","title":"Non-intrusive Data-driven ADI-based Low-rank Balanced Truncation","summary":" In this short note, a non-intrusive data-driven formulation of ADI-based\nlow-rank balanced truncation is provided. The proposed algorithm only requires\ntransfer function samples at the mirror images of ADI shifts. 
If some shifts\nare used in both approximating the controllability Gramian and the\nobservability Gramian, then samples of the transfer function's derivative at\nthese shifts are also needed to enforce Hermite interpolation in the Loewner\nframework. It is noted that ADI-based low-rank balanced truncation can be\nviewed as a two-step process. The first step involves constructing an\ninterpolant of the original model at the mirror images of the ADI shifts, which\ncan be done non-intrusively within the Loewner framework. The second step\ninvolves reducing this interpolant using low-rank factors of Gramians\nassociated with the interpolation data through the balanced square-root\nalgorithm. This second step does not require any system information, making the\noverall process non-intrusive with the only required information being samples\nof the transfer function and/or its derivative at the mirror images of ADI\nshifts. Furthermore, it is shown that when the order of the reduced model in\nADI-based low-rank balanced truncation is selected to match the numerical rank\nof the low-rank factors of the Gramians, it effectively reduces to standard\ninterpolation at the mirror images of the ADI shift. An illustrative example is\nprovided to explain the proposed approach.\n","authors":["Umair Zulfiqar"],"pdf_url":"https://arxiv.org/pdf/2501.05715v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05660v1","updated":"2025-01-10T02:24:35Z","published":"2025-01-10T02:24:35Z","title":"Fully Decentralized Computation Offloading in Priority-Driven Edge\n Computing Systems","summary":" We develop a novel framework for fully decentralized offloading policy design\nin multi-access edge computing (MEC) systems. The system comprises $N$\npower-constrained user equipments (UEs) assisted by an edge server (ES) to\nprocess incoming tasks. Tasks are labeled with urgency flags, and in this\npaper, we classify them under three urgency levels, namely, high, moderate, and\nlow urgency. 
We formulate the problem of designing computation decisions for\nthe UEs within a large population noncooperative game framework, where each UE\nselfishly decides on how to split task execution between its local onboard\nprocessor and the ES. We employ the weighted average age of information (AoI)\nmetric to quantify information freshness at the UEs. Increased onboard\nprocessing consumes more local power, while increased offloading may\npotentially incur a higher average AoI due to other UEs' packets being\noffloaded to the same ES. Thus, we use the mean-field game (MFG) formulation to\ncompute approximate decentralized Nash equilibrium offloading and local\ncomputation policies for the UEs to balance between the information freshness\nand local power consumption. Finally, we provide a projected gradient\ndescent-based algorithm to numerically assess the merits of our approach.\n","authors":["Shubham Aggarwal","Melih Bastopcu","Muhammad Aneeq uz Zaman","Tamer Başar","Sennur Ulukus","Nail Akar"],"pdf_url":"https://arxiv.org/pdf/2501.05660v1.pdf","comment":"Submitted to IEEE for possible publication"},{"id":"http://arxiv.org/abs/2501.05655v1","updated":"2025-01-10T01:57:10Z","published":"2025-01-10T01:57:10Z","title":"Downlink Performance of Cell-Free Massive MIMO for LEO Satellite\n Mega-Constellation","summary":" Low-earth orbit (LEO) satellite communication (SatCom) has emerged as a\npromising technology for improving wireless connectivity in global areas.\nCell-free massive multiple-input multiple-output (CF-mMIMO), an architecture\nrecently proposed for next-generation networks, has yet to be fully explored\nfor LEO satellites. In this paper, we investigate the downlink performance of a\nCF-mMIMO LEO SatCom network, where many satellite access points (SAPs)\nsimultaneously serve the corresponding ground user terminals (UTs). 
Using tools\nfrom stochastic geometry, we model the locations of SAPs and UTs on surfaces of\nconcentric spheres using Poisson point processes (PPPs) and present expressions\nbased on linear minimum-mean-square-error (LMMSE) channel estimation and\nconjugate beamforming. Then, we derive the coverage probabilities in both\nfading and non-fading scenarios, with significant system parameters such as the\nNakagami fading parameter, number of UTs, number of SAPs, orbital altitude, and\nservice range brought by the dome angle. Finally, the analytical model is\nverified by extensive Monte Carlo simulations. Simulation results show that\nstronger line-of-sight (LoS) effects and a more comprehensive service range of\nthe UT bring higher coverage probability despite existing multi-user\ninterference. Moreover, we found that there exist optimal numbers of UTs for\ndifferent orbital altitudes and dome angles, which provides valuable system\ndesign insights.\n","authors":["Xiangyu Li","Bodong Shang"],"pdf_url":"https://arxiv.org/pdf/2501.05655v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.09131v4","updated":"2025-01-10T23:30:13Z","published":"2023-12-14T17:01:58Z","title":"Physics-Informed Neural Network Lyapunov Functions: PDE\n Characterization, Learning, and Verification","summary":" We provide a systematic investigation of using physics-informed neural\nnetworks to compute Lyapunov functions. We encode Lyapunov conditions as a\npartial differential equation (PDE) and use this for training neural network\nLyapunov functions. We analyze the analytical properties of the solutions to\nthe Lyapunov and Zubov PDEs. In particular, we show that employing the Zubov\nequation in training neural Lyapunov functions can lead to approximate regions\nof attraction close to the true domain of attraction. We also examine\napproximation errors and the convergence of neural approximations to the unique\nsolution of Zubov's equation. 
We then provide sufficient conditions for the\nlearned neural Lyapunov functions that can be readily verified by\nsatisfiability modulo theories (SMT) solvers, enabling formal verification of\nboth local stability analysis and region-of-attraction estimates in the large.\nThrough a number of nonlinear examples, ranging from low to high dimensions, we\ndemonstrate that the proposed framework can outperform traditional\nsums-of-squares (SOS) Lyapunov functions obtained using semidefinite\nprogramming (SDP).\n","authors":["Jun Liu","Yiming Meng","Maxwell Fitzsimmons","Ruikun Zhou"],"pdf_url":"https://arxiv.org/pdf/2312.09131v4.pdf","comment":"The current version is accepted to the IFAC Journal Automatica"},{"id":"http://arxiv.org/abs/2501.06353v1","updated":"2025-01-10T21:25:16Z","published":"2025-01-10T21:25:16Z","title":"Event Constrained Programming","summary":" In this paper, we present event constraints as a new modeling paradigm that\ngeneralizes joint chance constraints from stochastic optimization to (1)\nenforce a constraint on the probability of satisfying a set of constraints\naggregated via application-specific logic (constituting an event) and (2) to be\napplied to general infinite-dimensional optimization (InfiniteOpt) problems\n(i.e., time, space, and/or uncertainty domains). This new constraint class\noffers significant modeling flexibility in posing InfiniteOpt constraints that\nare enforced over a certain portion of their domain (e.g., to a certain\nprobability level), but can be challenging to reformulate/solve due to\ndifficulties in representing arbitrary logical conditions and specifying a\nprobabilistic measure on a collection of constraints. 
To address these\nchallenges, we derive a generalized disjunctive programming (GDP)\nrepresentation of event constrained optimization problems, which readily\nenables us to pose logical event conditions in a standard form and allows us to\ndraw from a suite of GDP solution strategies that leverage the special\nstructure of this problem class. We also extend several approximation\ntechniques from the chance constraint literature to provide a means to\nreformulate certain event constraints without the use of binary variables. We\nillustrate these findings with case studies in stochastic optimal power flow,\ndynamic disease control, and optimal 2D diffusion.\n","authors":["Daniel Ovalle","Stefan Mazzadi","Carl D. Laird","Ignacio E. Grossmann","Joshua L. Pulsipher"],"pdf_url":"https://arxiv.org/pdf/2501.06353v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04669v4","updated":"2025-01-10T21:06:41Z","published":"2024-09-07T01:17:59Z","title":"Learning Optimal Stable Matches in Decentralized Markets with Unknown\n Preferences","summary":" Matching algorithms have demonstrated great success in several practical\napplications, but they often require centralized coordination and plentiful\ninformation. In many modern online marketplaces, agents must independently seek\nout and match with another using little to no information. For these kinds of\nsettings, can we design decentralized, limited-information matching algorithms\nthat preserve the desirable properties of standard centralized techniques? In\nthis work, we constructively answer this question in the affirmative. We model\na two-sided matching market as a game consisting of two disjoint sets of\nagents, referred to as proposers and acceptors, each of whom seeks to match\nwith their most preferable partner on the opposite side of the market. However,\neach proposer has no knowledge of their own preferences, so they must learn\ntheir preferences while forming matches in the market. 
We present a simple\nonline learning rule that guarantees a strong notion of probabilistic\nconvergence to the welfare-maximizing equilibrium of the game, referred to as\nthe proposer-optimal stable match. To the best of our knowledge, this\nrepresents the first completely decoupled, communication-free algorithm that\nguarantees probabilistic convergence to an optimal stable match, irrespective\nof the structure of the matching market.\n","authors":["Vade Shah","Bryce L. Ferguson","Jason R. Marden"],"pdf_url":"https://arxiv.org/pdf/2409.04669v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06335v1","updated":"2025-01-10T20:41:11Z","published":"2025-01-10T20:41:11Z","title":"A Comparison of Strategies to Embed Physics-Informed Neural Networks in\n Nonlinear Model Predictive Control Formulations Solved via Direct\n Transcription","summary":" This study aims to benchmark candidate strategies for embedding neural\nnetwork (NN) surrogates in nonlinear model predictive control (NMPC)\nformulations that are subject to systems described with partial differential\nequations and that are solved via direct transcription (i.e., simultaneous\nmethods). This study focuses on the use of physics-informed NNs and\nphysics-informed convolutional NNs as the internal (surrogate) models within\nthe NMPC formulation. One strategy embeds NN models as explicit algebraic\nconstraints, leveraging the automatic differentiation (AD) of an algebraic\nmodelling language (AML) to evaluate the derivatives. Alternatively, the solver\ncan be provided with derivatives computed external to the AML via the AD\nroutines of the machine learning environment the NN is trained in. The three\nnumerical experiments considered in this work reveal that replacing mechanistic\nmodels with NN surrogates may not always offer computational advantages when\nsmooth activation functions are used in conjunction with a local nonlinear\nsolver (e.g., Ipopt), even with highly nonlinear systems. 
Moreover, in this\ncontext, the external function evaluation of the NN surrogates often\noutperforms the embedding strategies that rely on explicit algebraic\nconstraints, likely due to the difficulty in initializing the auxiliary\nvariables and constraints introduced by explicit algebraic reformulations.\n","authors":["Carlos Andrés Elorza Casas","Luis A. Ricardez-Sandoval","Joshua L. Pulsipher"],"pdf_url":"https://arxiv.org/pdf/2501.06335v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07601v1","updated":"2025-01-10T22:31:53Z","published":"2025-01-10T22:31:53Z","title":"Real-Time Decision-Making for Digital Twin in Additive Manufacturing\n with Model Predictive Control using Time-Series Deep Neural Networks","summary":" Digital Twin-a virtual replica of a physical system enabling real-time\nmonitoring, model updating, prediction, and decision-making-combined with\nrecent advances in machine learning (ML), offers new opportunities for\nproactive control strategies in autonomous manufacturing. However, achieving\nreal-time decision-making with Digital Twins requires efficient optimization\ndriven by accurate predictions of highly nonlinear manufacturing systems. This\npaper presents a simultaneous multi-step Model Predictive Control (MPC)\nframework for real-time decision-making, using a multi-variate deep neural\nnetwork (DNN), named Time-Series Dense Encoder (TiDE), as the surrogate model.\nDifferent from the models in conventional MPC which only provide one-step ahead\nprediction, TiDE is capable of predicting future states within the prediction\nhorizon in one shot (multi-step), significantly accelerating MPC. Using\nDirected Energy Deposition additive manufacturing as a case study, we\ndemonstrate the effectiveness of the proposed MPC in achieving melt pool\ntemperature tracking to ensure part quality, while reducing porosity defects by\nregulating laser power to maintain melt pool depth constraints. 
In this work,\nwe first show that TiDE is capable of accurately predicting melt pool\ntemperature and depth. Second, we demonstrate that the proposed MPC achieves\nprecise temperature tracking while satisfying melt pool depth constraints\nwithin a targeted dilution range (10%-30%), reducing potential porosity\ndefects. Compared to the PID controller, MPC results in smoother and less\nfluctuating laser power profiles with competitive or superior melt pool\ntemperature control performance. This demonstrates MPC's proactive control\ncapabilities, leveraging time-series prediction and real-time optimization,\npositioning it as a powerful tool for future Digital Twin applications and\nreal-time process optimization in manufacturing.\n","authors":["Yi-Ping Chen","Vispi Karkaria","Ying-Kuan Tsai","Faith Rolark","Daniel Quispe","Robert X. Gao","Jian Cao","Wei Chen"],"pdf_url":"https://arxiv.org/pdf/2501.07601v1.pdf","comment":null}],"Optimization and Control":[{"id":"http://arxiv.org/abs/2501.06181v1","updated":"2025-01-10T18:58:44Z","published":"2025-01-10T18:58:44Z","title":"Best Response Convergence for Zero-sum Stochastic Dynamic Games with\n Partial and Asymmetric Information","summary":" We analyze best response dynamics for finding a Nash equilibrium of an\ninfinite horizon zero-sum stochastic linear quadratic dynamic game (LQDG) with\npartial and asymmetric information. We derive explicit expressions for each\nplayer's best response within the class of pure linear dynamic output feedback\ncontrol strategies where the internal state dimension of each control strategy\nis an integer multiple of the system state dimension. With each best response,\nthe players form increasingly higher-order belief states, leading to\ninfinite-dimensional internal states. 
However, we observe in extensive\nnumerical experiments that the game's value converges after just a few\niterations, suggesting that strategies associated with increasingly\nhigher-order belief states eventually provide no benefit. To help explain this\nconvergence, our numerical analysis reveals rapid decay of the controllability\nand observability Gramian eigenvalues and Hankel singular values in\nhigher-order belief dynamics, indicating that the higher-order belief dynamics\nbecome increasingly difficult for both players to control and observe.\nConsequently, the higher-order belief dynamics can be closely approximated by\nlow-order belief dynamics with bounded error, and thus feedback strategies with\nlimited internal state dimension can closely approximate a Nash equilibrium.\n","authors":["Yuxiang Guan","Iman Shames","Tyler H. Summers"],"pdf_url":"https://arxiv.org/pdf/2501.06181v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06167v1","updated":"2025-01-10T18:46:28Z","published":"2025-01-10T18:46:28Z","title":"Meta-Learning for Physically-Constrained Neural System Identification","summary":" We present a gradient-based meta-learning framework for rapid adaptation of\nneural state-space models (NSSMs) for black-box system identification. When\napplicable, we also incorporate domain-specific physical constraints to improve\nthe accuracy of the NSSM. The major benefit of our approach is that instead of\nrelying solely on data from a single target system, our framework utilizes data\nfrom a diverse set of source systems, enabling learning from limited target\ndata, as well as with few online training iterations. Through benchmark\nexamples, we demonstrate the potential of our approach, study the effect of\nfine-tuning subnetworks rather than full fine-tuning, and report real-world\ncase studies to illustrate the practical application and generalizability of\nthe approach to practical problems with physical-constraints. 
Specifically, we\nshow that the meta-learned models result in improved downstream performance in\nmodel-based state estimation in indoor localization and energy systems.\n","authors":["Ankush Chakrabarty","Gordon Wichern","Vedang M. Deshpande","Abraham P. Vinod","Karl Berntorp","Christopher R. Laughman"],"pdf_url":"https://arxiv.org/pdf/2501.06167v1.pdf","comment":"30 pages"},{"id":"http://arxiv.org/abs/2308.15732v2","updated":"2025-01-10T18:41:19Z","published":"2023-08-30T03:16:41Z","title":"On Lie-Bracket Averaging for a Class of Hybrid Dynamical Systems with\n Applications to Model-Free Control and Optimization","summary":" The stability of dynamical systems with oscillatory behaviors and\nwell-defined average vector fields has traditionally been studied using\naveraging theory. These tools have also been applied to hybrid dynamical\nsystems, which combine continuous and discrete dynamics. However, most\naveraging results for hybrid systems are limited to first-order methods,\nhindering their use in systems and algorithms that require high-order averaging\ntechniques, such as hybrid Lie-bracket-based extremum seeking algorithms and\nhybrid vibrational controllers. To address this limitation, we introduce a\nnovel high-order averaging theorem for analyzing the stability of hybrid\ndynamical systems with high-frequency periodic flow maps. These systems\nincorporate set-valued flow maps and jump maps, effectively modeling well-posed\ndifferential and difference inclusions. By imposing appropriate regularity\nconditions, we establish results on $(T,\\varepsilon)$-closeness of solutions\nand semi-global practical asymptotic stability for sets. These theoretical\nresults are then applied to the study of three distinct applications in the\ncontext of hybrid model-free control and optimization via Lie-bracket\naveraging.\n","authors":["Mahmoud Abdelgalil","Jorge I. 
Poveda"],"pdf_url":"https://arxiv.org/pdf/2308.15732v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06118v1","updated":"2025-01-10T17:15:59Z","published":"2025-01-10T17:15:59Z","title":"Nonlinear port-Hamiltonian system identification from input-state-output\n data","summary":" A framework for identifying nonlinear port-Hamiltonian systems using\ninput-state-output data is introduced. The framework utilizes neural networks'\nuniversal approximation capacity to effectively represent complex dynamics in a\nstructured way. We show that using the structure helps to make long-term\npredictions compared to baselines that do not incorporate physics. We also\nexplore different architectures based on MLPs, KANs, and using prior\ninformation. The technique is validated through examples featuring\nnonlinearities in either the skew-symmetric terms, the dissipative terms, or\nthe Hamiltonian.\n","authors":["Karim Cherifi","Achraf El Messaoudi","Hannes Gernandt","Marco Roschkowski"],"pdf_url":"https://arxiv.org/pdf/2501.06118v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.14121v3","updated":"2025-01-10T16:59:51Z","published":"2023-10-21T21:39:24Z","title":"Monotone Causality in Opportunistically Stochastic Shortest Path\n Problems","summary":" When traveling through a graph with an accessible deterministic path to a\ntarget, is it ever preferable to resort to stochastic node-to-node transitions\ninstead? And if so, what are the conditions guaranteeing that such a stochastic\noptimal routing policy can be computed efficiently? We aim to answer these\nquestions here by defining a class of Opportunistically Stochastic Shortest\nPath (OSSP) problems and deriving sufficient conditions for applicability of\nnon-iterative label-setting methods. The usefulness of this framework is\ndemonstrated in two very different contexts: numerical analysis and autonomous\nvehicle routing. 
We use OSSPs to derive causality conditions for\nsemi-Lagrangian discretizations of anisotropic Hamilton-Jacobi equations. We\nalso use a Dijkstra-like method to solve OSSPs optimizing the timing and\nurgency of lane change maneuvers for an autonomous vehicle navigating road\nnetworks with a heterogeneous traffic load.\n","authors":["Mallory E. Gaspard","Alexander Vladimirsky"],"pdf_url":"https://arxiv.org/pdf/2310.14121v3.pdf","comment":"Submitted to and under review for INFORMS Mathematics of Operations\n Research. Revised to address first round feedback from reviewers for this\n journal"},{"id":"http://arxiv.org/abs/2501.06081v1","updated":"2025-01-10T16:15:25Z","published":"2025-01-10T16:15:25Z","title":"Averaged Adam accelerates stochastic optimization in the training of\n deep neural network approximations for partial differential equation and\n optimal control problems","summary":" Deep learning methods - usually consisting of a class of deep neural networks\n(DNNs) trained by a stochastic gradient descent (SGD) optimization method - are\nnowadays omnipresent in data-driven learning problems as well as in scientific\ncomputing tasks such as optimal control (OC) and partial differential equation\n(PDE) problems. In practically relevant learning tasks, often not the\nplain-vanilla standard SGD optimization method is employed to train the\nconsidered class of DNNs but instead more sophisticated adaptive and\naccelerated variants of the standard SGD method such as the popular Adam\noptimizer are used. Inspired by the classical Polyak-Ruppert averaging\napproach, in this work we apply averaged variants of the Adam optimizer to\ntrain DNNs to approximately solve exemplary scientific computing problems in\nthe form of PDEs and OC problems. 
We test the averaged variants of Adam in a\nseries of learning problems including physics-informed neural network (PINN),\ndeep backward stochastic differential equation (deep BSDE), and deep Kolmogorov\napproximations for PDEs (such as heat, Black-Scholes, Burgers, and Allen-Cahn\nPDEs), including DNN approximations for OC problems, and including DNN\napproximations for image classification problems (ResNet for CIFAR-10). In each\nof the numerical examples the employed averaged variants of Adam outperform the\nstandard Adam and the standard SGD optimizers, particularly, in the situation\nof the scientific machine learning problems. The Python source codes for the\nnumerical experiments associated to this work can be found on GitHub at\nhttps://github.com/deeplearningmethods/averaged-adam.\n","authors":["Steffen Dereich","Arnulf Jentzen","Adrian Riekert"],"pdf_url":"https://arxiv.org/pdf/2501.06081v1.pdf","comment":"25 pages, 10 figures"},{"id":"http://arxiv.org/abs/2501.06079v1","updated":"2025-01-10T16:15:02Z","published":"2025-01-10T16:15:02Z","title":"Set-valued evenly convex functions: characterizations and c-conjugacy","summary":" In this work we deal with set-valued functions with values in the power set\nof a separated locally convex space where a nontrivial pointed convex cone\ninduces a partial order relation. A set-valued function is evenly convex if its\nepigraph is an evenly convex set, i.e., it is the intersection of an arbitrary\nfamily of open half-spaces. In this paper we characterize evenly convex\nset-valued functions as the pointwise supremum of its set-valued e-affine\nminorants. Moreover, a suitable conjugation pattern will be developed for these\nfunctions, as well as the counterpart of the biconjugation Fenchel-Moreau\ntheorem.\n","authors":["M. D. 
Fajardo"],"pdf_url":"https://arxiv.org/pdf/2501.06079v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06052v1","updated":"2025-01-10T15:32:35Z","published":"2025-01-10T15:32:35Z","title":"Rank conditions for exactness of semidefinite relaxations in polynomial\n optimization","summary":" We consider the Moment-SOS hierarchy in polynomial optimization. We first\nprovide a sufficient condition to solve the truncated K-moment problem\nassociated with a given degree-$2n$ pseudo-moment sequence $\\phi^n$ and a\nsemi-algebraic set $K \\subset \\mathbb{R}^d$. Namely, let $2v$ be the maximum\ndegree of the polynomials that describe $K$. If the rank $r$ of its associated\nmoment matrix is less than $nv + 1$, then $\\phi^n$ has an atomic representing\nmeasure supported on at most $r$ points of $K$. When used at step-$n$ of the\nMoment-SOS hierarchy, it provides a sufficient condition to guarantee its\nfinite convergence (i.e., the optimal value of the corresponding degree-$n$\nsemidefinite relaxation of the hierarchy is the global minimum). For Quadratically\nConstrained Quadratic Programs (QCQPs) one may also recover global minimizers\nfrom the optimal pseudo-moment sequence. 
Our condition is in the spirit of\nBlekherman's rank condition and while on the one hand it is more restrictive,\non the other hand it applies to constrained POPs as it provides a localization\non $K$ for the representing measure.\n","authors":["Jean B Lasserre"],"pdf_url":"https://arxiv.org/pdf/2501.06052v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11940v3","updated":"2025-01-10T15:07:43Z","published":"2024-01-22T13:30:11Z","title":"Low-Tubal-Rank Tensor Recovery via Factorized Gradient Descent","summary":" This paper considers the problem of recovering a tensor with an underlying\nlow-tubal-rank structure from a small number of corrupted linear measurements.\nTraditional approaches tackling such a problem require the computation of\ntensor Singular Value Decomposition (t-SVD), which is a computationally\nintensive process, rendering them impractical for dealing with large-scale\ntensors. Aiming to address this challenge, we propose an efficient and effective\nlow-tubal-rank tensor recovery method based on a factorization procedure akin\nto the Burer-Monteiro (BM) method. Precisely, our fundamental approach involves\ndecomposing a large tensor into two smaller factor tensors, followed by solving\nthe problem through factorized gradient descent (FGD). This strategy eliminates\nthe need for t-SVD computation, thereby reducing computational costs and\nstorage requirements. We provide rigorous theoretical analysis to ensure the\nconvergence of FGD under both noise-free and noisy situations. Additionally, it\nis worth noting that our method does not require the precise estimation of the\ntensor tubal-rank. Even in cases where the tubal-rank is slightly\noverestimated, our approach continues to demonstrate robust performance. 
A\nseries of experiments have been carried out to demonstrate that, as compared to\nother popular ones, our approach exhibits superior performance in multiple\nscenarios, in terms of the faster computational speed and the smaller\nconvergence error.\n","authors":["Zhiyu Liu","Zhi Han","Yandong Tang","Xi-Le Zhao","Yao Wang"],"pdf_url":"https://arxiv.org/pdf/2401.11940v3.pdf","comment":"13 pages, 4 figures"},{"id":"http://arxiv.org/abs/2501.06023v1","updated":"2025-01-10T15:01:36Z","published":"2025-01-10T15:01:36Z","title":"Distributed Generalized Nash Equilibria Learning for Online Stochastic\n Aggregative Games","summary":" This paper investigates online stochastic aggregative games subject to local\nset constraints and time-varying coupled inequality constraints, where each\nplayer possesses a time-varying expectation-valued cost function relying on not\nonly its own decision variable but also an aggregation of all the players'\nvariables. Each player can only access its local individual cost function and\nconstraints, necessitating partial information exchanges with neighboring\nplayers through time-varying unbalanced networks. Additionally, local cost\nfunctions and constraint functions are not prior knowledge and only revealed\ngradually. To learn generalized Nash equilibria of such games, a novel\ndistributed online stochastic algorithm is devised based on push-sum and\nprimal-dual strategies. Through rigorous analysis, high probability bounds on\nthe regret and constraint violation are provided by appropriately selecting\ndecreasing stepsizes. Moreover, for a time-invariant stochastic strongly\nmonotone game, it is shown that the generated sequence by the designed\nalgorithm converges to its variational generalized Nash equilibrium (GNE)\nalmost surely, and the time-averaged sequence converges sublinearly with high\nprobability. 
Finally, the derived theoretical results are illustrated by\nnumerical simulations.\n","authors":["Kaixin Du","Min Meng"],"pdf_url":"https://arxiv.org/pdf/2501.06023v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.05545v3","updated":"2025-01-10T14:51:06Z","published":"2024-12-07T05:47:28Z","title":"Convergence analysis of wide shallow neural operators within the\n framework of Neural Tangent Kernel","summary":" Neural operators are aiming at approximating operators mapping between Banach\nspaces of functions, achieving much success in the field of scientific\ncomputing. Compared to certain deep learning-based solvers, such as\nPhysics-Informed Neural Networks (PINNs), Deep Ritz Method (DRM), neural\noperators can solve a class of Partial Differential Equations (PDEs). Although\nmuch work has been done to analyze the approximation and generalization error\nof neural operators, there is still a lack of analysis on their training error.\nIn this work, we conduct the convergence analysis of gradient descent for the\nwide shallow neural operators and physics-informed shallow neural operators\nwithin the framework of Neural Tangent Kernel (NTK). The core idea lies on the\nfact that over-parameterization and random initialization together ensure that\neach weight vector remains near its initialization throughout all iterations,\nyielding the linear convergence of gradient descent. 
In this work, we\ndemonstrate that under the setting of over-parametrization, gradient descent\ncan find the global minimum regardless of whether it is in continuous time or\ndiscrete time.\n","authors":["Xianliang Xu","Ye Li","Zhongyi Huang"],"pdf_url":"https://arxiv.org/pdf/2412.05545v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14488v2","updated":"2025-01-10T13:52:14Z","published":"2024-12-19T03:22:47Z","title":"A stochastic first-order method with multi-extrapolated momentum for\n highly smooth unconstrained optimization","summary":" In this paper, we consider an unconstrained stochastic optimization problem\nwhere the objective function exhibits high-order smoothness. Specifically, we\npropose a new stochastic first-order method (SFOM) with multi-extrapolated\nmomentum, in which multiple extrapolations are performed in each iteration,\nfollowed by a momentum update based on these extrapolations. We demonstrate\nthat the proposed SFOM can accelerate optimization by exploiting the high-order\nsmoothness of the objective function $f$. Assuming that the $p$th-order\nderivative of $f$ is Lipschitz continuous for some $p\\ge2$, and under\nadditional mild assumptions, we establish that our method achieves a sample\ncomplexity of $\\widetilde{\\mathcal{O}}(\\epsilon^{-(3p+1)/p})$ for finding a\npoint $x$ such that $\\mathbb{E}[\\|\\nabla f(x)\\|]\\le\\epsilon$. To the best of\nour knowledge, this is the first SFOM to leverage arbitrary-order smoothness of\nthe objective function for acceleration, resulting in a sample complexity that\nimproves upon the best-known results without assuming the mean-squared\nsmoothness condition. 
Preliminary numerical experiments validate the practical\nperformance of our method and support our theoretical findings.\n","authors":["Chuan He"],"pdf_url":"https://arxiv.org/pdf/2412.14488v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05942v1","updated":"2025-01-10T13:06:36Z","published":"2025-01-10T13:06:36Z","title":"Soft regression trees: a model variant and a decomposition training\n algorithm","summary":" Decision trees are widely used for classification and regression tasks in a\nvariety of application fields due to their interpretability and good accuracy.\nDuring the past decade, growing attention has been devoted to globally\noptimized decision trees with deterministic or soft splitting rules at branch\nnodes, which are trained by optimizing the error function over all the tree\nparameters. In this work, we propose a new variant of soft multivariate\nregression trees (SRTs) where, for every input vector, the prediction is\ndefined as the linear regression associated to a single leaf node, namely, the\nleaf node obtained by routing the input vector from the root along the branches\nwith higher probability. SRTs exhibit the conditional computational property,\ni.e., each prediction depends on a small number of nodes (parameters), and our\nnonlinear optimization formulation for training them is amenable to\ndecomposition. After showing a universal approximation result for SRTs, we\npresent a decomposition training algorithm including a clustering-based\ninitialization procedure and a heuristic for reassigning the input vectors\nalong the tree. Under mild assumptions, we establish asymptotic convergence\nguarantees. 
Experiments on 15 well-known datasets indicate that our SRTs and\ndecomposition algorithm yield higher accuracy and robustness compared with\ntraditional soft regression trees trained using the nonlinear optimization\nformulation of Blanquero et al., and a significant reduction in training times\nas well as a slightly better average accuracy compared with the mixed-integer\noptimization approach of Bertsimas and Dunn. We also report a comparison with\nthe Random Forest ensemble method.\n","authors":["Antonio Consolo","Edoardo Amaldi","Andrea Manno"],"pdf_url":"https://arxiv.org/pdf/2501.05942v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05939v1","updated":"2025-01-10T13:02:50Z","published":"2025-01-10T13:02:50Z","title":"Designing a Robust and Cost-Efficient Electrified Bus Network with\n Sparse Energy Consumption Data","summary":" This paper addresses the challenges of charging infrastructure design (CID)\nfor electrified public transport networks using Battery Electric Buses (BEBs)\nunder conditions of sparse energy consumption data. Accurate energy consumption\nestimation is critical for cost-effective and reliable electrification but\noften requires costly field experiments, resulting in limited data. To address\nthis issue, we propose two mathematical models designed to handle uncertainty\nand data sparsity in energy consumption. The first is a robust optimization\nmodel with box uncertainty, addressing variability in energy consumption. The\nsecond is a data-driven distributionally robust optimization model that\nleverages observed data to provide more flexible and informed solutions. To\nevaluate these models, we apply them to the Rotterdam bus network. Our analysis\nreveals three key insights: (1) Ignoring variations in energy consumption can\nresult in operational unreliability, with up to 55\% of scenarios leading to\ninfeasible trips. 
(2) Designing infrastructure based on worst-case energy\nconsumption increases costs by 67\\% compared to using average estimates. (3)\nThe data-driven distributionally robust optimization model reduces costs by\n28\\% compared to the box uncertainty model while maintaining reliability,\nespecially in scenarios where extreme energy consumption values are rare and\ndata exhibit skewness. In addition to cost savings, this approach provides\nrobust protection against uncertainty, ensuring reliable operation under\ndiverse conditions.\n","authors":["Sara Momen","Yousef Maknoon","Bart van Arem","Shadi Sharif Azadeh"],"pdf_url":"https://arxiv.org/pdf/2501.05939v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05930v1","updated":"2025-01-10T12:52:00Z","published":"2025-01-10T12:52:00Z","title":"Random Sparse Lifts: Construction, Analysis and Convergence of finite\n sparse networks","summary":" We present a framework to define a large class of neural networks for which,\nby construction, training by gradient flow provably reaches arbitrarily low\nloss when the number of parameters grows. Distinct from the fixed-space global\noptimality of non-convex optimization, this new form of convergence, and the\ntechniques introduced to prove such convergence, pave the way for a usable deep\nlearning convergence theory in the near future, without overparameterization\nassumptions relating the number of parameters and training samples. We define\nthese architectures from a simple computation graph and a mechanism to lift it,\nthus increasing the number of parameters, generalizing the idea of increasing\nthe widths of multi-layer perceptrons. 
We show that architectures similar to\nmost common deep learning models are present in this class, obtained by\nsparsifying the weight tensors of usual architectures at initialization.\nLeveraging tools of algebraic topology and random graph theory, we use the\ncomputation graph's geometry to propagate properties guaranteeing convergence\nto any precision for these large sparse models.\n","authors":["David A. R. Robin","Kevin Scaman","Marc Lelarge"],"pdf_url":"https://arxiv.org/pdf/2501.05930v1.pdf","comment":"The Twelfth International Conference on Learning Representations, May\n 2024, Vienna, Austria"},{"id":"http://arxiv.org/abs/2412.09594v2","updated":"2025-01-10T09:40:04Z","published":"2024-12-12T18:58:14Z","title":"Wait-Less Offline Tuning and Re-solving for Online Decision Making","summary":" Online linear programming (OLP) has found broad applications in revenue\nmanagement and resource allocation. State-of-the-art OLP algorithms achieve low\nregret by repeatedly solving linear programming (LP) subproblems that\nincorporate updated resource information. However, LP-based methods are\ncomputationally expensive and often inefficient for large-scale applications.\nIn contrast, recent first-order OLP algorithms are more computationally\nefficient but typically suffer from worse regret guarantees. To address these\nshortcomings, we propose a new algorithm that combines the strengths of\nLP-based and first-order OLP methods. The algorithm re-solves the LP\nsubproblems periodically at a predefined frequency $f$ and uses the latest dual\nprices to guide online decision-making. In addition, a first-order method runs\nin parallel during each interval between LP re-solves, smoothing resource\nconsumption. 
Our algorithm achieves $\\mathscr{O}(\\log (T/f) + \\sqrt{f})$\nregret, delivering a \"wait-less\" online decision-making process that balances\nthe computational efficiency of first-order methods and the superior regret\nguarantee of LP-based methods.\n","authors":["Jingruo Sun","Wenzhi Gao","Ellen Vitercik","Yinyu Ye"],"pdf_url":"https://arxiv.org/pdf/2412.09594v2.pdf","comment":"In this version, we achieve a tighter regret bound with the warm\n start for the first batch. We also make the proof more elegant by manually\n accepting all subsequent orders once the constraint is violated. In this way,\n we do not need to introduce the concept of stopping time for the analysis of\n the LP-based method"},{"id":"http://arxiv.org/abs/2404.08289v2","updated":"2025-01-10T08:50:35Z","published":"2024-04-12T07:27:25Z","title":"Generic controllability of equivariant systems and applications to\n particle systems and neural networks","summary":" There exist many examples of systems which have some symmetries, and which\none may monitor with symmetry preserving controls. Since symmetries are\npreserved along the evolution, full controllability is not possible, and\ncontrollability has to be considered inside sets of states with same\nsymmetries. We prove that generic systems with symmetries are controllable in\nthis sense. This result has several applications, for instance: (i) generic\ncontrollability of particle systems when the kernel of interaction between\nparticles plays the role of a mean-field control; (ii) generic controllability\nfor families of vector fields on manifolds with boundary; (iii) universal\ninterpolation for neural networks architectures with \"generic\" self\nattention-type layers - a type of layers ubiquitous in recent neural networks\narchitectures, e.g., in the Transformers architecture. 
The tools we develop\ncould help address various other questions of control of equivariant systems.\n","authors":["Andrei Agrachev","Cyril Letrouit"],"pdf_url":"https://arxiv.org/pdf/2404.08289v2.pdf","comment":"To appear in Annales de l'Institut Henri Poincar\\'e, Analyse non\n lin\\'eaire"},{"id":"http://arxiv.org/abs/2404.09746v2","updated":"2025-01-10T08:17:37Z","published":"2024-04-15T12:47:23Z","title":"Gradient descent for unbounded convex functions on Hadamard manifolds\n and its applications to scaling problems","summary":" In this paper, we study asymptotic behaviors of continuous-time and\ndiscrete-time gradient flows of a ``lower-unbounded'' convex function $f$ on a\nHadamard manifold $M$, particularly, their convergence properties to the\nboundary $M^{\\infty}$ at infinity of $M$. We establish a duality theorem that\nthe infimum of the gradient-norm $\\|\\nabla f(x)\\|$ of $f$ over $M$ is equal to\nthe supremum of the negative of the recession function $f^{\\infty}$ of $f$ over\nthe boundary $M^{\\infty}$, provided the infimum is positive. Further, the\ninfimum and the supremum are obtained by the limits of the gradient flows of\n$f$. Our results feature convex-optimization ingredients of the moment-weight\ninequality for reductive group actions by Georgoulas, Robbin, and Salamon, and\nare applied to noncommutative optimization by B\\"urgisser et al. FOCS 2019. We\nshow that the gradient descent of the Kempf-Ness function for an unstable orbit\nconverges to a 1-parameter subgroup in the Hilbert-Mumford criterion, and the\nassociated moment-map sequence converges to the minimum-norm point of the\nmoment polytope. We show further refinements for operator scaling -- the\nleft-right action on a matrix tuple $A= (A_1,A_2,\\ldots,A_N)$. We characterize\nthe gradient-flow limit of operator scaling by a vector-space generalization of\nthe classical Dulmage-Mendelsohn decomposition of a bipartite graph. 
Also, for\na special case of $N = 2$, we reveal that this limit determines the Kronecker\ncanonical form of matrix pencils $s A_1+A_2$.\n","authors":["Hiroshi Hirai","Keiya Sakabe"],"pdf_url":"https://arxiv.org/pdf/2404.09746v2.pdf","comment":"The conference version in FOCS 2024"},{"id":"http://arxiv.org/abs/2406.00621v3","updated":"2025-01-10T06:10:19Z","published":"2024-06-02T05:50:41Z","title":"Log-Scale Quantization in Distributed First-Order Methods:\n Gradient-based Learning from Distributed Data","summary":" Decentralized strategies are of interest for learning from large-scale data\nover networks. This paper studies learning over a network of geographically\ndistributed nodes/agents subject to quantization. Each node possesses a private\nlocal cost function, collectively contributing to a global cost function, which\nthe considered methodology aims to minimize. In contrast to many existing\npapers, the information exchange among nodes is log-quantized to address\nlimited network-bandwidth in practical situations. We consider a first-order\ncomputationally efficient distributed optimization algorithm (with no extra\ninner consensus loop) that leverages node-level gradient correction based on\nlocal data and network-level gradient aggregation only over nearby nodes. This\nmethod only requires balanced networks with no need for stochastic weight\ndesign. It can handle log-scale quantized data exchange over possibly\ntime-varying and switching network setups. We study convergence over both\nstructured networks (for example, training over data-centers) and ad-hoc\nmulti-agent networks (for example, training over dynamic robotic networks).\nThrough experimental validation, we show that (i) structured networks generally\nresult in a smaller optimality gap, and (ii) log-scale quantization leads to a\nsmaller optimality gap compared to uniform quantization.\n","authors":["Mohammadreza Doostmohammadian","Muhammad I. Qureshi","Mohammad Hossein Khalesi","Hamid R. 
Rabiee","Usman A. Khan"],"pdf_url":"https://arxiv.org/pdf/2406.00621v3.pdf","comment":"IEEE TASE 2025"},{"id":"http://arxiv.org/abs/2501.05737v1","updated":"2025-01-10T06:06:41Z","published":"2025-01-10T06:06:41Z","title":"Efficient Gradient Tracking Algorithms for Distributed Optimization\n Problems with Inexact Communication","summary":" Distributed optimization problems usually face inexact communication issues\ninduced by communication quantization, differential privacy protection, or\nchannel noise. Most existing algorithms need a two-timescale setting of the\nstepsize of gradient descent and the parameter of noise suppression to ensure\nthe convergence to the optimal solution. In this paper, we propose two\nsingle-timescale algorithms, VRA-DGT and VRA-DSGT, for distributed\ndeterministic and stochastic optimization problems with inexact communication,\nrespectively. VRA-DGT integrates the Variance-Reduced Aggregation (VRA)\nmechanism with the distributed gradient tracking framework, which achieves a\nconvergence rate of $\\mathcal{O}\\left(k^{-1}\\right)$ in the mean-square sense\nwhen the objective function is strongly convex and smooth. For the distributed\nstochastic optimization problem, VRA-DSGT, which introduces a hybrid variance\nreduction technique into VRA-DGT, maintains the convergence rate of\n$\\mathcal{O}\\left(k^{-1}\\right)$ for strongly convex and smooth objective\nfunctions. Simulated experiments on a logistic regression problem with\nreal-world data verify the effectiveness of the proposed algorithms.\n","authors":["Shengchao Zhaoa","Yongchao Liu"],"pdf_url":"https://arxiv.org/pdf/2501.05737v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07616v2","updated":"2025-01-10T05:44:12Z","published":"2024-08-14T15:31:15Z","title":"Prophet Inequalities: Competing with the Top $\\ell$ Items is Easy","summary":" We explore a prophet inequality problem, where the values of a sequence of\nitems are drawn i.i.d. 
from some distribution, and an online decision maker\nmust select one item irrevocably. We establish that $\\mathrm{CR}_{\\ell}$, the\nworst-case competitive ratio between the expected optimal performance of an\nonline decision maker compared to that of a prophet who uses the average of the\ntop $\\ell$ items, is exactly the solution to an integral equation. This quantity\n$\\mathrm{CR}_{\\ell}$ is larger than $1-e^{-\\ell}$. This implies that the bound\nconverges exponentially fast to $1$ as $\\ell$ grows. In particular, for\n$\\ell=2$, $\\mathrm{CR}_{2} \\approx 0.966$, which is much closer to $1$ than the\nclassical bound of $0.745$ for $\\ell=1$. Additionally, we prove asymptotic\nlower bounds for the competitive ratio of a more general scenario, where the\ndecision maker is permitted to select $k$ items. This subsumes the $k$\nmulti-unit i.i.d. prophet problem and provides the current best asymptotic\nguarantees, as well as enables broader understanding in the more general\nframework. Finally, we prove a tight asymptotic competitive ratio when only\nstatic threshold policies are allowed.\n","authors":["Mathieu Molina","Nicolas Gast","Patrick Loiseau","Vianney Perchet"],"pdf_url":"https://arxiv.org/pdf/2408.07616v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05701v1","updated":"2025-01-10T04:19:38Z","published":"2025-01-10T04:19:38Z","title":"A Two-timescale Primal-dual Algorithm for Decentralized Optimization\n with Compression","summary":" This paper proposes a two-timescale compressed primal-dual (TiCoPD) algorithm\nfor decentralized optimization with improved communication efficiency over\nprior works on primal-dual decentralized optimization. The algorithm is built\nupon the primal-dual optimization framework and utilizes a\nmajorization-minimization procedure. The latter naturally suggests that the agents\nshare a compressed difference term during the iteration. 
Furthermore, the\nTiCoPD algorithm incorporates a fast timescale mirror sequence for agent\nconsensus on nonlinearly compressed terms, together with a slow timescale\nprimal-dual recursion for optimizing the objective function. We show that the\nTiCoPD algorithm converges with a constant step size. It also finds an $O(1/T)$\nstationary solution after $T$ iterations. Numerical experiments on decentralized\ntraining of a neural network validate the efficacy of the TiCoPD algorithm.\n","authors":["Haoming Liu","Chung-Yiu Yau","Hoi-To Wai"],"pdf_url":"https://arxiv.org/pdf/2501.05701v1.pdf","comment":"5 pages, 8 figures"},{"id":"http://arxiv.org/abs/2501.05693v1","updated":"2025-01-10T03:52:29Z","published":"2025-01-10T03:52:29Z","title":"Robust Adaptive Supplementary Control for Damping Weak-Grid SSOs\n Involving IBRs","summary":" Subsynchronous oscillations (SSOs) involving grid-following converters\n(GFLCs) connected to weak grids are a relatively new phenomenon observed in\nmodern power systems. SSOs are further exacerbated when grids become weaker\nbecause lines are disconnected due to maintenance or following faults. Such\nundesirable oscillations have also led to curtailment of inverter-based\nresource (IBR) outputs. In contrast to most literature addressing the issue by\nretuning/redesigning of standard IBR controllers, we propose a robust adaptive\nsupplementary control for damping of such SSOs while keeping standard controls\nunaltered. As a result, uncertainty in system conditions can be handled without\nnegatively impacting the nominal IBR performance. To that end, the adaptive\ncontrol law is derived for a GFLC connected to the grid, where the grid is\nmodeled by Thevenin's equivalent representation with uncertainty and\ndisturbances. The theoretical result provides a dissipativity certificate for the\nclosed-loop error dynamics with sufficient conditions for stability. 
The\neffectiveness of the developed controller is validated with several case\nstudies conducted on a single-GFLC-infinite-bus test system, the IEEE $2$-area\ntest system, wherein some of the synchronous generators are replaced by GFLCs,\nand a modified IEEE $5$-area test system with two GFLCs. The findings\ndemonstrate that under very weak grid conditions, the proposed robust adaptive\ncontrol performs well in stabilizing SSO modes, which a classical\nstate-feedback control method fails to address.\n","authors":["Sina Ameli","Lilan Karunaratne","Nilanjan Ray Chaudhuri","Constantino Lagoa"],"pdf_url":"https://arxiv.org/pdf/2501.05693v1.pdf","comment":"14 pages, 19 figures, 3 tables, IEEE Transactions on Power Systems"},{"id":"http://arxiv.org/abs/2501.05677v1","updated":"2025-01-10T03:01:48Z","published":"2025-01-10T03:01:48Z","title":"Single-Loop Variance-Reduced Stochastic Algorithm for Nonconvex-Concave\n Minimax Optimization","summary":" Nonconvex-concave (NC-C) finite-sum minimax problems have broad applications\nin decentralized optimization and various machine learning tasks. However, the\nnonsmooth nature of NC-C problems makes it challenging to design effective\nvariance reduction techniques. Existing vanilla stochastic algorithms using\nuniform samples for gradient estimation often exhibit slow convergence rates\nand require bounded variance assumptions. In this paper, we develop a novel\nprobabilistic variance reduction updating scheme and propose a single-loop\nalgorithm called the probabilistic variance-reduced smoothed gradient\ndescent-ascent (PVR-SGDA) algorithm. The proposed algorithm achieves an\niteration complexity of $O(\\epsilon^{-4})$, surpassing the best-known rates of\nstochastic algorithms for NC-C minimax problems and matching the performance of\nthe best deterministic algorithms in this context. 
Finally, we demonstrate the\neffectiveness of the proposed algorithm through numerical simulations.\n","authors":["Xia Jiang","Linglingzhi Zhu","Taoli Zheng","Anthony Man-Cho So"],"pdf_url":"https://arxiv.org/pdf/2501.05677v1.pdf","comment":"The conference version of this paper has been accepted by ICASSP 2025"},{"id":"http://arxiv.org/abs/2501.05676v1","updated":"2025-01-10T02:59:56Z","published":"2025-01-10T02:59:56Z","title":"An Efficient Dual ADMM for Huber Regression with Fused Lasso Penalty","summary":" The ordinary least squares estimate in linear regression is sensitive to the\ninfluence of errors with large variance, which reduces its robustness,\nespecially when dealing with heavy-tailed errors or outliers frequently\nencountered in real-world scenarios. To address this issue and accommodate the\nsparsity of coefficients along with their sequential disparities, we combine\nthe adaptive robust Huber loss function with a fused lasso penalty. This\ncombination yields a robust estimator capable of simultaneously achieving\nestimation and variable selection. Furthermore, we utilize an efficient\nalternating direction method of multipliers to solve this regression model from\na dual perspective. The effectiveness and efficiency of our proposed approach\nare demonstrated through numerical experiments carried out on both simulated and\nreal datasets.\n","authors":["Mengjiao Shi","Yunhai Xiao"],"pdf_url":"https://arxiv.org/pdf/2501.05676v1.pdf","comment":"14 pages, 24 figures"},{"id":"http://arxiv.org/abs/2105.04684v4","updated":"2025-01-10T02:50:39Z","published":"2021-05-10T21:46:12Z","title":"An automatic system to detect equivalence between iterative algorithms","summary":" When are two algorithms the same? How can we be sure a recently proposed\nalgorithm is novel, and not a minor twist on an existing method? 
In this paper,\nwe present a framework for reasoning about equivalence between a broad class of\niterative algorithms, with a focus on algorithms designed for convex\noptimization. We propose several notions of what it means for two algorithms to\nbe equivalent, and provide computationally tractable means to detect\nequivalence. Our main definition, oracle equivalence, states that two\nalgorithms are equivalent if they result in the same sequence of calls to the\nfunction oracles (for suitable initialization). Borrowing from control theory,\nwe use state-space realizations to represent algorithms and characterize\nalgorithm equivalence via transfer functions. Our framework can also identify\nand characterize some algorithm transformations including permutations of the\nupdate equations, repetition of the iteration, and conjugation of some of the\nfunction oracles in the algorithm. To support the paper, we have developed a\nsoftware package named Linnaeus that implements the framework to identify other\niterative algorithms that are equivalent to an input algorithm. More broadly,\nthis framework and software advances the goal of making mathematics searchable.\n","authors":["Shipu Zhao","Laurent Lessard","Madeleine Udell"],"pdf_url":"https://arxiv.org/pdf/2105.04684v4.pdf","comment":"This paper documents a software system for identifying equivalence\n between optimization algorithms. The analysis in this paper has been improved\n in arxiv:2501.04972"},{"id":"http://arxiv.org/abs/2303.10503v3","updated":"2025-01-10T01:27:06Z","published":"2023-03-18T21:28:45Z","title":"Counter-examples in first-order optimization: a constructive approach","summary":" While many approaches were developed for obtaining worst-case complexity\nbounds for first-order optimization methods in the last years, there remain\ntheoretical gaps in cases where no such bound can be found. 
In such cases, it\nis often unclear whether no such bound exists (e.g., because the algorithm\nmight fail to systematically converge) or simply if the current techniques do\nnot allow finding them.\n In this work, we propose an approach to automate the search for cyclic\ntrajectories generated by first-order methods. This provides a constructive\napproach to show that no appropriate complexity bound exists, thereby\ncomplementing the approaches providing sufficient conditions for convergence.\nUsing this tool, we provide ranges of parameters for which some of the famous\nheavy-ball, Nesterov accelerated gradient, inexact gradient descent, and\nthree-operator splitting algorithms fail to systematically converge, and show\nthat it nicely complements existing tools searching for Lyapunov functions.\n","authors":["Baptiste Goujaud","Aymeric Dieuleveut","Adrien Taylor"],"pdf_url":"https://arxiv.org/pdf/2303.10503v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05642v1","updated":"2025-01-10T00:59:43Z","published":"2025-01-10T00:59:43Z","title":"FIRM: Federated Image Reconstruction using Multimodal Tomographic Data","summary":" We propose a federated algorithm for reconstructing images using multimodal\ntomographic data sourced from dispersed locations, addressing the challenges of\ntraditional unimodal approaches that are prone to noise and reduced image\nquality. Our approach formulates a joint inverse optimization problem\nincorporating multimodality constraints and solves it in a federated framework\nthrough local gradient computations complemented by lightweight central\noperations, ensuring data decentralization. Leveraging the connection between\nour federated algorithm and the quadratic penalty method, we introduce an\nadaptive step-size rule with guaranteed sublinear convergence and further\nsuggest its extension to augmented Lagrangian framework. 
Numerical results\ndemonstrate its superior computational efficiency and improved image\nreconstruction quality.\n","authors":["Geunyeong Byeon","Minseok Ryu","Zichao Wendy Di","Kibaek Kim"],"pdf_url":"https://arxiv.org/pdf/2501.05642v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.09131v4","updated":"2025-01-10T23:30:13Z","published":"2023-12-14T17:01:58Z","title":"Physics-Informed Neural Network Lyapunov Functions: PDE\n Characterization, Learning, and Verification","summary":" We provide a systematic investigation of using physics-informed neural\nnetworks to compute Lyapunov functions. We encode Lyapunov conditions as a\npartial differential equation (PDE) and use this for training neural network\nLyapunov functions. We analyze the analytical properties of the solutions to\nthe Lyapunov and Zubov PDEs. In particular, we show that employing the Zubov\nequation in training neural Lyapunov functions can lead to approximate regions\nof attraction close to the true domain of attraction. We also examine\napproximation errors and the convergence of neural approximations to the unique\nsolution of Zubov's equation. 
We then provide sufficient conditions for the\nlearned neural Lyapunov functions that can be readily verified by\nsatisfiability modulo theories (SMT) solvers, enabling formal verification of\nboth local stability analysis and region-of-attraction estimates in the large.\nThrough a number of nonlinear examples, ranging from low to high dimensions, we\ndemonstrate that the proposed framework can outperform traditional\nsums-of-squares (SOS) Lyapunov functions obtained using semidefinite\nprogramming (SDP).\n","authors":["Jun Liu","Yiming Meng","Maxwell Fitzsimmons","Ruikun Zhou"],"pdf_url":"https://arxiv.org/pdf/2312.09131v4.pdf","comment":"The current version is accepted to the IFAC Journal Automatica"},{"id":"http://arxiv.org/abs/2407.13868v4","updated":"2025-01-10T21:43:57Z","published":"2024-07-18T19:28:05Z","title":"Stochastic Monotone Inclusion with Closed Loop Distributions","summary":" In this paper, we study in a Hilbertian setting, first and second-order\nmonotone inclusions related to stochastic optimization problems with decision\ndependent distributions. The studied dynamics are formulated as monotone\ninclusions governed by Lipschitz perturbations of maximally monotone operators\nwhere the concept of equilibrium plays a central role. We discuss the\nrelationship between the $\\mathbb{W}_1$-Wasserstein Lipschitz behavior of the\ndistribution and the so-called coarse Ricci curvature. 
As an application, we\nconsider the monotone inclusions associated with stochastic optimisation\nproblems involving the sum of a smooth function with Lipschitz gradient, a\nproximable function and a composite term.\n","authors":["Hamza Ennaji","Jalal Fadili","Hedy Attouch"],"pdf_url":"https://arxiv.org/pdf/2407.13868v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06353v1","updated":"2025-01-10T21:25:16Z","published":"2025-01-10T21:25:16Z","title":"Event Constrained Programming","summary":" In this paper, we present event constraints as a new modeling paradigm that\ngeneralizes joint chance constraints from stochastic optimization to (1)\nenforce a constraint on the probability of satisfying a set of constraints\naggregated via application-specific logic (constituting an event) and (2) to be\napplied to general infinite-dimensional optimization (InfiniteOpt) problems\n(i.e., time, space, and/or uncertainty domains). This new constraint class\noffers significant modeling flexibility in posing InfiniteOpt constraints that\nare enforced over a certain portion of their domain (e.g., to a certain\nprobability level), but can be challenging to reformulate/solve due to\ndifficulties in representing arbitrary logical conditions and specifying a\nprobabilistic measure on a collection of constraints. To address these\nchallenges, we derive a generalized disjunctive programming (GDP)\nrepresentation of event constrained optimization problems, which readily\nenables us to pose logical event conditions in a standard form and allows us to\ndraw from a suite of GDP solution strategies that leverage the special\nstructure of this problem class. We also extend several approximation\ntechniques from the chance constraint literature to provide a means to\nreformulate certain event constraints without the use of binary variables. 
We\nillustrate these findings with case studies in stochastic optimal power flow,\ndynamic disease control, and optimal 2D diffusion.\n","authors":["Daniel Ovalle","Stefan Mazzadi","Carl D. Laird","Ignacio E. Grossmann","Joshua L. Pulsipher"],"pdf_url":"https://arxiv.org/pdf/2501.06353v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06350v1","updated":"2025-01-10T21:23:05Z","published":"2025-01-10T21:23:05Z","title":"SMOP: Stochastic trust region method for multi-objective problems","summary":" The problem considered is a multi-objective optimization problem, in which\nthe goal is to find an optimal value of a vector function representing various\ncriteria. The aim of this work is to develop an algorithm which utilizes the\ntrust region framework with probabilistic model functions, able to cope with\nnoisy problems, using inaccurate functions and gradients. We prove the almost\nsure convergence of the proposed algorithm to a Pareto critical point if the\nmodel functions are good approximations in a probabilistic sense. Numerical\nresults demonstrate the effectiveness of the probabilistic trust region by\ncomparing it to competitive stochastic multi-objective solvers. The application\nin supervised machine learning is showcased by training non-discriminatory\nLogistic Regression models on data groups of different sizes. 
Additionally, we use\nseveral test examples with irregularly shaped fronts to exhibit the efficiency\nof the algorithm.\n","authors":["Nataša Krejić","Nataša Krklec Jerinkić","Luka Rutešić"],"pdf_url":"https://arxiv.org/pdf/2501.06350v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06335v1","updated":"2025-01-10T20:41:11Z","published":"2025-01-10T20:41:11Z","title":"A Comparison of Strategies to Embed Physics-Informed Neural Networks in\n Nonlinear Model Predictive Control Formulations Solved via Direct\n Transcription","summary":" This study aims to benchmark candidate strategies for embedding neural\nnetwork (NN) surrogates in nonlinear model predictive control (NMPC)\nformulations that are subject to systems described with partial differential\nequations and that are solved via direct transcription (i.e., simultaneous\nmethods). This study focuses on the use of physics-informed NNs and\nphysics-informed convolutional NNs as the internal (surrogate) models within\nthe NMPC formulation. One strategy embeds NN models as explicit algebraic\nconstraints, leveraging the automatic differentiation (AD) of an algebraic\nmodelling language (AML) to evaluate the derivatives. Alternatively, the solver\ncan be provided with derivatives computed external to the AML via the AD\nroutines of the machine learning environment the NN is trained in. The three\nnumerical experiments considered in this work reveal that replacing mechanistic\nmodels with NN surrogates may not always offer computational advantages when\nsmooth activation functions are used in conjunction with a local nonlinear\nsolver (e.g., Ipopt), even with highly nonlinear systems. 
Moreover, in this\ncontext, the external function evaluation of the NN surrogates often\noutperforms the embedding strategies that rely on explicit algebraic\nconstraints, likely due to the difficulty in initializing the auxiliary\nvariables and constraints introduced by explicit algebraic reformulations.\n","authors":["Carlos Andrés Elorza Casas","Luis A. Ricardez-Sandoval","Joshua L. Pulsipher"],"pdf_url":"https://arxiv.org/pdf/2501.06335v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06279v1","updated":"2025-01-10T13:57:32Z","published":"2025-01-10T13:57:32Z","title":"Reinforcing Infrastructure Networks with Multicriteria Portfolio\n Decision Analysis: An Application to Railway Stations in Finland","summary":" Advanced societies are crucially dependent on critical infrastructure\nnetworks for the reliable delivery of essential goods and services. Hence,\nwell-founded analyses concerning disruptions are needed to guide decisions that\nseek to ensure the performance of these networks in the face of failures caused\nby vulnerabilities to external hazards or technical malfunctions. In this\nsetting, we develop a multicriteria decision analysis approach to support the\nformulation of cost-efficient portfolios of preventive reinforcement actions.\nOur approach is general in that it (i) allows for multiple objectives, such as\nthose that represent the volume of traffic that is enabled between alternative\norigin-destination pairs in a transportation network, (ii) uses methods of\nprobabilistic risk assessment to quantify the expected performance of the\nnetwork, and (iii) solves optimization problems to identify those combinations\nof reinforcement actions that are cost-efficient in improving the performance\nof the network, given the available, possibly incomplete information about the\nrelative importance of objectives. 
Our methodological contributions are\nillustrated by a case study on the analysis of railway switches at a\nrepresentative Finnish railway station.\n","authors":["Joaquín de la Barra","Ahti Salo","Leevi Olander","Kash Barker","Jussi Kangaspunta"],"pdf_url":"https://arxiv.org/pdf/2501.06279v1.pdf","comment":"32 pages, 7 figures"},{"id":"http://arxiv.org/abs/2501.06275v1","updated":"2025-01-10T10:16:38Z","published":"2025-01-10T10:16:38Z","title":"Exploratory Randomization for Discrete-Time Linear Exponential Quadratic\n Gaussian (LEQG) Problem","summary":" We investigate exploratory randomization for an extended\nlinear-exponential-quadratic-Gaussian (LEQG) control problem in discrete time.\nThis extended control problem is related to the structure of risk-sensitive\ninvestment management applications. We introduce exploration through a\nrandomization of the control. Next, we apply the duality between free energy\nand relative entropy to reduce the LEQG problem to an equivalent risk-neutral\nLQG control problem with an entropy regularization term, see, e.g. Dai Pra et\nal. (1996), for which we present a solution approach based on Dynamic\nProgramming. Our approach, based on the energy-entropy duality may also be\nconsidered as leading to a justification for the use, in the literature, of an\nentropy regularization when applying a randomized control.\n","authors":["Sebastien Lleo","Wolfgang Runggaldier"],"pdf_url":"https://arxiv.org/pdf/2501.06275v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2501.06187v1","updated":"2025-01-10T18:59:54Z","published":"2025-01-10T18:59:54Z","title":"Multi-subject Open-set Personalization in Video Generation","summary":" Video personalization methods allow us to synthesize videos with specific\nconcepts such as people, pets, and places. However, existing methods often\nfocus on limited domains, require time-consuming optimization per subject, or\nsupport only a single subject. 
We present Video Alchemist $-$ a video model\nwith built-in multi-subject, open-set personalization capabilities for both\nforeground objects and background, eliminating the need for time-consuming\ntest-time optimization. Our model is built on a new Diffusion Transformer\nmodule that fuses each conditional reference image and its corresponding\nsubject-level text prompt with cross-attention layers. Developing such a large\nmodel presents two main challenges: dataset and evaluation. First, as paired\ndatasets of reference images and videos are extremely hard to collect, we\nsample selected video frames as reference images and synthesize a clip of the\ntarget video. However, while models can easily denoise training videos given\nreference frames, they fail to generalize to new contexts. To mitigate this\nissue, we design a new automatic data construction pipeline with extensive\nimage augmentations. Second, evaluating open-set video personalization is a\nchallenge in itself. To address this, we introduce a personalization benchmark\nthat focuses on accurate subject fidelity and supports diverse personalization\nscenarios. Finally, our extensive experiments show that our method\nsignificantly outperforms existing personalization methods in both quantitative\nand qualitative evaluations.\n","authors":["Tsai-Shien Chen","Aliaksandr Siarohin","Willi Menapace","Yuwei Fang","Kwot Sin Lee","Ivan Skorokhodov","Kfir Aberman","Jun-Yan Zhu","Ming-Hsuan Yang","Sergey Tulyakov"],"pdf_url":"https://arxiv.org/pdf/2501.06187v1.pdf","comment":"Project page:\n https://snap-research.github.io/open-set-video-personalization/"},{"id":"http://arxiv.org/abs/2501.06186v1","updated":"2025-01-10T18:59:51Z","published":"2025-01-10T18:59:51Z","title":"LlamaV-o1: Rethinking Step-by-step Visual Reasoning in LLMs","summary":" Reasoning is a fundamental capability for solving complex multi-step\nproblems, particularly in visual contexts where sequential step-wise\nunderstanding is essential. 
Existing approaches lack a comprehensive framework\nfor evaluating visual reasoning and do not emphasize step-wise problem-solving.\nTo this end, we propose a comprehensive framework for advancing step-by-step\nvisual reasoning in large language models (LMMs) through three key\ncontributions. First, we introduce a visual reasoning benchmark specifically\ndesigned to evaluate multi-step reasoning tasks. The benchmark presents a\ndiverse set of challenges with eight different categories ranging from complex\nvisual perception to scientific reasoning with over 4k reasoning steps in\ntotal, enabling robust evaluation of LLMs' abilities to perform accurate and\ninterpretable visual reasoning across multiple steps. Second, we propose a\nnovel metric that assesses visual reasoning quality at the granularity of\nindividual steps, emphasizing both correctness and logical coherence. The\nproposed metric offers deeper insights into reasoning performance compared to\ntraditional end-task accuracy metrics. Third, we present a new multimodal\nvisual reasoning model, named LlamaV-o1, trained using a multi-step curriculum\nlearning approach, where tasks are progressively organized to facilitate\nincremental skill acquisition and problem-solving. The proposed LlamaV-o1 is\ndesigned for multi-step reasoning and learns step-by-step through a structured\ntraining paradigm. Extensive experiments show that our LlamaV-o1 outperforms\nexisting open-source models and performs favorably against close-source\nproprietary models. Compared to the recent Llava-CoT, our LlamaV-o1 achieves an\naverage score of 67.3 with an absolute gain of 3.8\\% across six benchmarks\nwhile being 5 times faster during inference scaling. 
Our benchmark, model, and\ncode are publicly available.\n","authors":["Omkar Thawakar","Dinura Dissanayake","Ketan More","Ritesh Thawkar","Ahmed Heakl","Noor Ahsan","Yuhao Li","Mohammed Zumri","Jean Lahoud","Rao Muhammad Anwer","Hisham Cholakkal","Ivan Laptev","Mubarak Shah","Fahad Shahbaz Khan","Salman Khan"],"pdf_url":"https://arxiv.org/pdf/2501.06186v1.pdf","comment":"15 pages, 5 Figures"},{"id":"http://arxiv.org/abs/2501.06184v1","updated":"2025-01-10T18:59:42Z","published":"2025-01-10T18:59:42Z","title":"PEACE: Empowering Geologic Map Holistic Understanding with MLLMs","summary":" Geologic map, as a fundamental diagram in geology science, provides critical\ninsights into the structure and composition of Earth's subsurface and surface.\nThese maps are indispensable in various fields, including disaster detection,\nresource exploration, and civil engineering. Despite their significance,\ncurrent Multimodal Large Language Models (MLLMs) often fall short in geologic\nmap understanding. This gap is primarily due to the challenging nature of\ncartographic generalization, which involves handling high-resolution map,\nmanaging multiple associated components, and requiring domain-specific\nknowledge. To quantify this gap, we construct GeoMap-Bench, the first-ever\nbenchmark for evaluating MLLMs in geologic map understanding, which assesses\nthe full-scale abilities in extracting, referring, grounding, reasoning, and\nanalyzing. To bridge this gap, we introduce GeoMap-Agent, the inaugural agent\ndesigned for geologic map understanding, which features three modules:\nHierarchical Information Extraction (HIE), Domain Knowledge Injection (DKI),\nand Prompt-enhanced Question Answering (PEQA). Inspired by the\ninterdisciplinary collaboration among human scientists, an AI expert group acts\nas consultants, utilizing a diverse tool pool to comprehensively analyze\nquestions. 
Through comprehensive experiments, GeoMap-Agent achieves an overall\nscore of 0.811 on GeoMap-Bench, significantly outperforming 0.369 of GPT-4o.\nOur work, emPowering gEologic mAp holistiC undErstanding (PEACE) with MLLMs,\npaves the way for advanced AI applications in geology, enhancing the efficiency\nand accuracy of geological investigations.\n","authors":["Yangyu Huang","Tianyi Gao","Haoran Xu","Qihao Zhao","Yang Song","Zhipeng Gui","Tengchao Lv","Hao Chen","Lei Cui","Scarlett Li","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2501.06184v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05450v2","updated":"2025-01-10T18:58:11Z","published":"2025-01-09T18:59:56Z","title":"Decentralized Diffusion Models","summary":" Large-scale AI model training divides work across thousands of GPUs, then\nsynchronizes gradients across them at each step. This incurs a significant\nnetwork burden that only centralized, monolithic clusters can support, driving\nup infrastructure costs and straining power systems. We propose Decentralized\nDiffusion Models, a scalable framework for distributing diffusion model\ntraining across independent clusters or datacenters by eliminating the\ndependence on a centralized, high-bandwidth networking fabric. Our method\ntrains a set of expert diffusion models over partitions of the dataset, each in\nfull isolation from one another. At inference time, the experts ensemble\nthrough a lightweight router. We show that the ensemble collectively optimizes\nthe same objective as a single model trained over the whole dataset. This means\nwe can divide the training burden among a number of \"compute islands,\" lowering\ninfrastructure costs and improving resilience to localized GPU failures.\nDecentralized diffusion models empower researchers to take advantage of\nsmaller, more cost-effective and more readily available compute like on-demand\nGPU nodes rather than central integrated systems. 
We conduct extensive\nexperiments on ImageNet and LAION Aesthetics, showing that decentralized\ndiffusion models FLOP-for-FLOP outperform standard diffusion models. We finally\nscale our approach to 24 billion parameters, demonstrating that high-quality\ndiffusion models can now be trained with just eight individual GPU nodes in\nless than a week.\n","authors":["David McAllister","Matthew Tancik","Jiaming Song","Angjoo Kanazawa"],"pdf_url":"https://arxiv.org/pdf/2501.05450v2.pdf","comment":"Project webpage: https://decentralizeddiffusion.github.io/"},{"id":"http://arxiv.org/abs/2501.06173v1","updated":"2025-01-10T18:52:11Z","published":"2025-01-10T18:52:11Z","title":"VideoAuteur: Towards Long Narrative Video Generation","summary":" Recent video generation models have shown promising results in producing\nhigh-quality video clips lasting several seconds. However, these models face\nchallenges in generating long sequences that convey clear and informative\nevents, limiting their ability to support coherent narrations. In this paper,\nwe present a large-scale cooking video dataset designed to advance long-form\nnarrative generation in the cooking domain. We validate the quality of our\nproposed dataset in terms of visual fidelity and textual caption accuracy using\nstate-of-the-art Vision-Language Models (VLMs) and video generation models,\nrespectively. We further introduce a Long Narrative Video Director to enhance\nboth visual and semantic coherence in generated videos and emphasize the role\nof aligning visual embeddings to achieve improved overall video quality. 
Our\nmethod demonstrates substantial improvements in generating visually detailed\nand semantically aligned keyframes, supported by finetuning techniques that\nintegrate text and image embeddings within the video generation process.\nProject page: https://videoauteur.github.io/\n","authors":["Junfei Xiao","Feng Cheng","Lu Qi","Liangke Gui","Jiepeng Cen","Zhibei Ma","Alan Yuille","Lu Jiang"],"pdf_url":"https://arxiv.org/pdf/2501.06173v1.pdf","comment":"Preprint, https://videoauteur.github.io/"},{"id":"http://arxiv.org/abs/2501.06151v1","updated":"2025-01-10T18:24:00Z","published":"2025-01-10T18:24:00Z","title":"PySpatial: A High-Speed Whole Slide Image Pathomics Toolkit","summary":" Whole Slide Image (WSI) analysis plays a crucial role in modern digital\npathology, enabling large-scale feature extraction from tissue samples.\nHowever, traditional feature extraction pipelines based on tools like\nCellProfiler often involve lengthy workflows, requiring WSI segmentation into\npatches, feature extraction at the patch level, and subsequent mapping back to\nthe original WSI. To address these challenges, we present PySpatial, a\nhigh-speed pathomics toolkit specifically designed for WSI-level analysis.\nPySpatial streamlines the conventional pipeline by directly operating on\ncomputational regions of interest, reducing redundant processing steps.\nUtilizing rtree-based spatial indexing and matrix-based computation, PySpatial\nefficiently maps and processes computational regions, significantly\naccelerating feature extraction while maintaining high accuracy. Our\nexperiments on two datasets-Perivascular Epithelioid Cell (PEC) and data from\nthe Kidney Precision Medicine Project (KPMP)-demonstrate substantial\nperformance improvements. For smaller and sparse objects in PEC datasets,\nPySpatial achieves nearly a 10-fold speedup compared to standard CellProfiler\npipelines. 
For larger objects, such as glomeruli and arteries in KPMP datasets,\nPySpatial achieves a 2-fold speedup. These results highlight PySpatial's\npotential to handle large-scale WSI analysis with enhanced efficiency and\naccuracy, paving the way for broader applications in digital pathology.\n","authors":["Yuechen Yang","Yu Wang","Tianyuan Yao","Ruining Deng","Mengmeng Yin","Shilin Zhao","Haichun Yang","Yuankai Huo"],"pdf_url":"https://arxiv.org/pdf/2501.06151v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02780v2","updated":"2025-01-10T18:14:56Z","published":"2024-09-17T19:07:13Z","title":"Guess What I Think: Streamlined EEG-to-Image Generation with Latent\n Diffusion Models","summary":" Generating images from brain waves is gaining increasing attention due to its\npotential to advance brain-computer interface (BCI) systems by understanding\nhow brain signals encode visual cues. Most of the literature has focused on\nfMRI-to-Image tasks as fMRI is characterized by high spatial resolution.\nHowever, fMRI is an expensive neuroimaging modality and does not allow for\nreal-time BCI. On the other hand, electroencephalography (EEG) is a low-cost,\nnon-invasive, and portable neuroimaging technique, making it an attractive\noption for future real-time applications. Nevertheless, EEG presents inherent\nchallenges due to its low spatial resolution and susceptibility to noise and\nartifacts, which makes generating images from EEG more difficult. In this\npaper, we address these problems with a streamlined framework based on the\nControlNet adapter for conditioning a latent diffusion model (LDM) through EEG\nsignals. 
We conduct experiments and ablation studies on popular benchmarks to\ndemonstrate that the proposed method beats other state-of-the-art models.\nUnlike these methods, which often require extensive preprocessing, pretraining,\ndifferent losses, and captioning models, our approach is efficient and\nstraightforward, requiring only minimal preprocessing and a few components. The\ncode is available at https://github.com/LuigiSigillo/GWIT.\n","authors":["Eleonora Lopez","Luigi Sigillo","Federica Colonnese","Massimo Panella","Danilo Comminiello"],"pdf_url":"https://arxiv.org/pdf/2410.02780v2.pdf","comment":"Accepted at ICASSP 2025"},{"id":"http://arxiv.org/abs/2409.11456v2","updated":"2025-01-10T17:54:39Z","published":"2024-09-17T17:48:12Z","title":"Two Stage Segmentation of Cervical Tumors using PocketNet","summary":" Cervical cancer remains the fourth most common malignancy amongst women\nworldwide.1 Concurrent chemoradiotherapy (CRT) serves as the mainstay\ndefinitive treatment regimen for locally advanced cervical cancers and includes\nexternal beam radiation followed by brachytherapy.2 Integral to radiotherapy\ntreatment planning is the routine contouring of both the target tumor at the\nlevel of the cervix, associated gynecologic anatomy and the adjacent organs at\nrisk (OARs). However, manual contouring of these structures is both time and\nlabor intensive and associated with known interobserver variability that can\nimpact treatment outcomes. While multiple tools have been developed to\nautomatically segment OARs and the high-risk clinical tumor volume (HR-CTV)\nusing computed tomography (CT) images,3,4,5,6 the development of deep\nlearning-based tumor segmentation tools using routine T2-weighted (T2w)\nmagnetic resonance imaging (MRI) addresses an unmet clinical need to improve\nthe routine contouring of both anatomical structures and cervical cancers,\nthereby increasing quality and consistency of radiotherapy planning. 
This work\napplied a novel deep-learning model (PocketNet) to segment the cervix, vagina,\nuterus, and tumor(s) on T2w MRI. The performance of the PocketNet architecture\nwas evaluated, when trained on data via 5-fold cross validation. PocketNet\nachieved a mean Dice-Sorensen similarity coefficient (DSC) exceeding 70% for\ntumor segmentation and 80% for organ segmentation. These results suggest that\nPocketNet is robust to variations in contrast protocols, providing reliable\nsegmentation of the regions of interest.\n","authors":["Awj Twam","Megan Jacobsen","Rachel Glenn","Peng Wei","Jia Sun","Ann Klopp","Aradhana M. Venkatesan","David Fuentes"],"pdf_url":"https://arxiv.org/pdf/2409.11456v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06138v1","updated":"2025-01-10T17:52:47Z","published":"2025-01-10T17:52:47Z","title":"MS-Temba : Multi-Scale Temporal Mamba for Efficient Temporal Action\n Detection","summary":" Action detection in real-world scenarios is particularly challenging due to\ndensely distributed actions in hour-long untrimmed videos. It requires modeling\nboth short- and long-term temporal relationships while handling significant\nintra-class temporal variations. Previous state-of-the-art (SOTA)\nTransformer-based architectures, though effective, are impractical for\nreal-world deployment due to their high parameter count, GPU memory usage, and\nlimited throughput, making them unsuitable for very long videos. In this work,\nwe innovatively adapt the Mamba architecture for action detection and propose\nMulti-scale Temporal Mamba (MS-Temba), comprising two key components: Temporal\nMamba (Temba) Blocks and the Temporal Mamba Fuser. Temba Blocks include the\nTemporal Local Module (TLM) for short-range temporal modeling and the Dilated\nTemporal SSM (DTS) for long-range dependencies. By introducing dilations, a\nnovel concept for Mamba, TLM and DTS capture local and global features at\nmultiple scales. 
The Temba Fuser aggregates these scale-specific features using\nMamba to learn comprehensive multi-scale representations of untrimmed videos.\nMS-Temba is validated on three public datasets, outperforming SOTA methods on\nlong videos and matching prior methods on short videos while using only\none-eighth of the parameters.\n","authors":["Arkaprava Sinha","Monish Soundar Raj","Pu Wang","Ahmed Helmy","Srijan Das"],"pdf_url":"https://arxiv.org/pdf/2501.06138v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.02189v2","updated":"2025-01-10T17:43:10Z","published":"2025-01-04T04:59:33Z","title":"Benchmark Evaluations, Applications, and Challenges of Large Vision\n Language Models: A Survey","summary":" Multimodal Vision Language Models (VLMs) have emerged as a transformative\ntechnology at the intersection of computer vision and natural language\nprocessing, enabling machines to perceive and reason about the world through\nboth visual and textual modalities. For example, models such as CLIP, Claude,\nand GPT-4V demonstrate strong reasoning and understanding abilities on visual\nand textual data and beat classical single modality vision models on zero-shot\nclassification. Despite their rapid advancements in research and growing\npopularity in applications, a comprehensive survey of existing studies on VLMs\nis notably lacking, particularly for researchers aiming to leverage VLMs in\ntheir specific domains. To this end, we provide a systematic overview of VLMs\nin the following aspects: model information of the major VLMs developed over\nthe past five years (2019-2024); the main architectures and training methods of\nthese VLMs; summary and categorization of the popular benchmarks and evaluation\nmetrics of VLMs; the applications of VLMs including embodied agents, robotics,\nand video generation; the challenges and issues faced by current VLMs such as\nhallucination, fairness, and safety. 
Detailed collections including papers and\nmodel repository links are listed in\nhttps://github.com/zli12321/Awesome-VLM-Papers-And-Models.git.\n","authors":["Zongxia Li","Xiyang Wu","Hongyang Du","Huy Nghiem","Guangyao Shi"],"pdf_url":"https://arxiv.org/pdf/2501.02189v2.pdf","comment":"35 pages, 3 figures"},{"id":"http://arxiv.org/abs/2408.11810v2","updated":"2025-01-10T17:29:36Z","published":"2024-08-21T17:56:34Z","title":"Pixel Is Not A Barrier: An Effective Evasion Attack for Pixel-Domain\n Diffusion Models","summary":" Diffusion Models have emerged as powerful generative models for high-quality\nimage synthesis, with many subsequent image editing techniques based on them.\nHowever, the ease of text-based image editing introduces significant risks,\nsuch as malicious editing for scams or intellectual property infringement.\nPrevious works have attempted to safeguard images from diffusion-based editing\nby adding imperceptible perturbations. These methods are costly and\nspecifically target prevalent Latent Diffusion Models (LDMs), while\nPixel-domain Diffusion Models (PDMs) remain largely unexplored and robust\nagainst such attacks. Our work addresses this gap by proposing a novel attack\nframework, AtkPDM. 
AtkPDM is mainly composed of a feature representation\nattacking loss that exploits vulnerabilities in denoising UNets and a latent\noptimization strategy to enhance the naturalness of adversarial images.\nExtensive experiments demonstrate the effectiveness of our approach in\nattacking dominant PDM-based editing methods (e.g., SDEdit) while maintaining\nreasonable fidelity and robustness against common defense methods.\nAdditionally, our framework is extensible to LDMs, achieving comparable\nperformance to existing approaches.\n","authors":["Chun-Yen Shih","Li-Xuan Peng","Jia-Wei Liao","Ernie Chu","Cheng-Fu Chou","Jun-Cheng Chen"],"pdf_url":"https://arxiv.org/pdf/2408.11810v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.05580v2","updated":"2025-01-10T17:06:36Z","published":"2024-12-07T08:08:24Z","title":"Self-Supervised Masked Mesh Learning for Unsupervised Anomaly Detection\n on 3D Cortical Surfaces","summary":" Unsupervised anomaly detection in brain imaging is challenging. In this\npaper, we propose a self-supervised masked mesh learning for unsupervised\nanomaly detection in 3D cortical surfaces. Our framework leverages the\nintrinsic geometry of the cortical surface to learn a self-supervised\nrepresentation that captures the underlying structure of the brain. We\nintroduce a masked mesh convolutional neural network (MMN) that learns to\npredict masked regions of the cortical surface. By training the MMN on a large\ndataset of healthy subjects, we learn a representation that captures the normal\nvariation in the cortical surface. We then use this representation to detect\nanomalies in unseen individuals by calculating anomaly scores based on the\nreconstruction error of the MMN. We evaluate our framework by training on\npopulation-scale dataset UKB and HCP-Aging and testing on two datasets of\nAlzheimer's disease patients ADNI and OASIS3. 
Our results show that our\nframework can detect anomalies in cortical thickness, cortical volume, and\ncortical sulcus features, which are known to be sensitive biomarkers for\nAlzheimer's disease. Our proposed framework provides a promising approach for\nunsupervised anomaly detection based on normative variation of cortical\nfeatures.\n","authors":["Hao-Chun Yang","Sicheng Dai","Saige Rutherford","Christian Gaser","Andre F Marquand","Christian F Beckmann","Thomas Wolfers"],"pdf_url":"https://arxiv.org/pdf/2412.05580v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05409v2","updated":"2025-01-10T16:58:29Z","published":"2025-01-09T18:06:45Z","title":"Atlas: A Novel Pathology Foundation Model by Mayo Clinic, Charité, and\n Aignostics","summary":" Recent advances in digital pathology have demonstrated the effectiveness of\nfoundation models across diverse applications. In this report, we present\nAtlas, a novel vision foundation model based on the RudolfV approach. Our model\nwas trained on a dataset comprising 1.2 million histopathology whole slide\nimages, collected from two medical institutions: Mayo Clinic and Charit\\'e -\nUniverst\\\"atsmedizin Berlin. 
Comprehensive evaluations show that Atlas achieves\nstate-of-the-art performance across twenty-one public benchmark datasets, even\nthough it is neither the largest model by parameter count nor by training\ndataset size.\n","authors":["Maximilian Alber","Stephan Tietz","Jonas Dippel","Timo Milbich","Timothée Lesort","Panos Korfiatis","Moritz Krügener","Beatriz Perez Cancer","Neelay Shah","Alexander Möllers","Philipp Seegerer","Alexandra Carpen-Amarie","Kai Standvoss","Gabriel Dernbach","Edwin de Jong","Simon Schallenberg","Andreas Kunft","Helmut Hoffer von Ankershoffen","Gavin Schaeferle","Patrick Duffy","Matt Redlon","Philipp Jurmeister","David Horst","Lukas Ruff","Klaus-Robert Müller","Frederick Klauschen","Andrew Norgan"],"pdf_url":"https://arxiv.org/pdf/2501.05409v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.19635v2","updated":"2025-01-10T16:51:33Z","published":"2023-10-30T15:25:29Z","title":"Improving Medical Visual Representations via Radiology Report Generation","summary":" Vision-language pretraining has been shown to produce high-quality visual\nencoders which transfer efficiently to downstream computer vision tasks.\nContrastive learning approaches have increasingly been adopted for medical\nvision language pretraining (MVLP), yet recent developments in generative AI\noffer new modeling alternatives. This paper introduces RadTex, a CNN-encoder\ntransformer-decoder architecture optimized for radiology. We explore\nbidirectional captioning as an alternative MVLP strategy and demonstrate that\nRadTex's captioning pretraining is competitive with established contrastive\nmethods, achieving a CheXpert macro-AUC of 89.4%. 
Additionally, RadTex's\nlightweight text decoder not only generates clinically relevant radiology\nreports (macro-F1 score of 0.349), but also provides targeted, interactive\nresponses, highlighting the utility of bidirectional captioning in advancing\nmedical image analysis.\n","authors":["Keegan Quigley","Miriam Cha","Josh Barua","Geeticka Chauhan","Seth Berkowitz","Steven Horng","Polina Golland"],"pdf_url":"https://arxiv.org/pdf/2310.19635v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.08168v2","updated":"2025-01-10T16:44:55Z","published":"2024-10-10T17:45:12Z","title":"ZeroComp: Zero-shot Object Compositing from Image Intrinsics via\n Diffusion","summary":" We present ZeroComp, an effective zero-shot 3D object compositing approach\nthat does not require paired composite-scene images during training. Our method\nleverages ControlNet to condition from intrinsic images and combines it with a\nStable Diffusion model to utilize its scene priors, together operating as an\neffective rendering engine. During training, ZeroComp uses intrinsic images\nbased on geometry, albedo, and masked shading, all without the need for paired\nimages of scenes with and without composite objects. Once trained, it\nseamlessly integrates virtual 3D objects into scenes, adjusting shading to\ncreate realistic composites. We developed a high-quality evaluation dataset and\ndemonstrate that ZeroComp outperforms methods using explicit lighting\nestimations and generative techniques in quantitative and human perception\nbenchmarks. 
Additionally, ZeroComp extends to real and outdoor image\ncompositing, even when trained solely on synthetic indoor data, showcasing its\neffectiveness in image compositing.\n","authors":["Zitian Zhang","Frédéric Fortier-Chouinard","Mathieu Garon","Anand Bhattad","Jean-François Lalonde"],"pdf_url":"https://arxiv.org/pdf/2410.08168v2.pdf","comment":"Project page: https://lvsn.github.io/ZeroComp, Code:\n https://github.com/lvsn/ZeroComp"},{"id":"http://arxiv.org/abs/2210.06433v3","updated":"2025-01-10T16:26:43Z","published":"2022-10-12T17:30:12Z","title":"Self-supervised video pretraining yields robust and more human-aligned\n visual representations","summary":" Humans learn powerful representations of objects and scenes by observing how\nthey evolve over time. Yet, outside of specific tasks that require explicit\ntemporal understanding, static image pretraining remains the dominant paradigm\nfor learning visual foundation models. We question this mismatch, and ask\nwhether video pretraining can yield visual representations that bear the\nhallmarks of human perception: generalisation across tasks, robustness to\nperturbations, and consistency with human judgements. To that end we propose a\nnovel procedure for curating videos, and develop a contrastive framework which\nlearns from the complex transformations therein. This simple paradigm for\ndistilling knowledge from videos, called VITO, yields general representations\nthat far outperform prior video pretraining methods on image understanding\ntasks, and image pretraining methods on video understanding tasks. Moreover,\nVITO representations are significantly more robust to natural and synthetic\ndeformations than image-, video-, and adversarially-trained ones. Finally,\nVITO's predictions are strongly aligned with human judgements, surpassing\nmodels that were specifically trained for that purpose. 
Together, these results\nsuggest that video pretraining could be a simple way of learning unified,\nrobust, and human-aligned representations of the visual world.\n","authors":["Nikhil Parthasarathy","S. M. Ali Eslami","João Carreira","Olivier J. Hénaff"],"pdf_url":"https://arxiv.org/pdf/2210.06433v3.pdf","comment":"Accepted to 37th Conference on Neural Information Processing Systems\n (NeurIPS 2023)"},{"id":"http://arxiv.org/abs/2501.05177v2","updated":"2025-01-10T15:44:28Z","published":"2025-01-09T11:52:54Z","title":"FaceMe: Robust Blind Face Restoration with Personal Identification","summary":" Blind face restoration is a highly ill-posed problem due to the lack of\nnecessary context. Although existing methods produce high-quality outputs, they\noften fail to faithfully preserve the individual's identity. In this paper, we\npropose a personalized face restoration method, FaceMe, based on a diffusion\nmodel. Given a single or a few reference images, we use an identity encoder to\nextract identity-related features, which serve as prompts to guide the\ndiffusion model in restoring high-quality and identity-consistent facial\nimages. By simply combining identity-related features, we effectively minimize\nthe impact of identity-irrelevant features during training and support any\nnumber of reference image inputs during inference. Additionally, thanks to the\nrobustness of the identity encoder, synthesized images can be used as reference\nimages during training, and identity changing during inference does not require\nfine-tuning the model. We also propose a pipeline for constructing a reference\nimage training pool that simulates the poses and expressions that may appear in\nreal-world scenarios. 
Experimental results demonstrate that our FaceMe can\nrestore high-quality facial images while maintaining identity consistency,\nachieving excellent performance and robustness.\n","authors":["Siyu Liu","Zheng-Peng Duan","Jia OuYang","Jiayi Fu","Hyunhee Park","Zikun Liu","Chun-Le Guo","Chongyi Li"],"pdf_url":"https://arxiv.org/pdf/2501.05177v2.pdf","comment":"To appear at AAAI 2025"},{"id":"http://arxiv.org/abs/2407.18243v3","updated":"2025-01-10T15:37:27Z","published":"2024-07-25T17:57:48Z","title":"BIV-Priv-Seg: Locating Private Content in Images Taken by People With\n Visual Impairments","summary":" Individuals who are blind or have low vision (BLV) are at a heightened risk\nof sharing private information if they share photographs they have taken. To\nfacilitate developing technologies that can help them preserve privacy, we\nintroduce BIV-Priv-Seg, the first localization dataset originating from people\nwith visual impairments that shows private content. It contains 1,028 images\nwith segmentation annotations for 16 private object categories. We first\ncharacterize BIV-Priv-Seg and then evaluate modern models' performance for\nlocating private content in the dataset. We find modern models struggle most\nwith locating private objects that are not salient, small, and lack text as\nwell as recognizing when private content is absent from an image. 
We facilitate\nfuture extensions by sharing our new dataset with the evaluation server at\nhttps://vizwiz.org/tasks-and-datasets/object-localization.\n","authors":["Yu-Yun Tseng","Tanusree Sharma","Lotus Zhang","Abigale Stangl","Leah Findlater","Yang Wang","Danna Gurari"],"pdf_url":"https://arxiv.org/pdf/2407.18243v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04103v2","updated":"2025-01-10T15:37:26Z","published":"2024-07-04T18:06:48Z","title":"Advances in Diffusion Models for Image Data Augmentation: A Review of\n Methods, Models, Evaluation Metrics and Future Research Directions","summary":" Image data augmentation constitutes a critical methodology in modern computer\nvision tasks, since it can facilitate towards enhancing the diversity and\nquality of training datasets; thereby, improving the performance and robustness\nof machine learning models in downstream tasks. In parallel, augmentation\napproaches can also be used for editing/modifying a given image in a context-\nand semantics-aware way. Diffusion Models (DMs), which comprise one of the most\nrecent and highly promising classes of methods in the field of generative\nArtificial Intelligence (AI), have emerged as a powerful tool for image data\naugmentation, capable of generating realistic and diverse images by learning\nthe underlying data distribution. The current study realizes a systematic,\ncomprehensive and in-depth review of DM-based approaches for image\naugmentation, covering a wide range of strategies, tasks and applications. In\nparticular, a comprehensive analysis of the fundamental principles, model\narchitectures and training strategies of DMs is initially performed.\nSubsequently, a taxonomy of the relevant image augmentation methods is\nintroduced, focusing on techniques regarding semantic manipulation,\npersonalization and adaptation, and application-specific augmentation tasks.\nThen, performance assessment methodologies and respective evaluation metrics\nare analyzed. 
Finally, current challenges and future research directions in the\nfield are discussed.\n","authors":["Panagiotis Alimisis","Ioannis Mademlis","Panagiotis Radoglou-Grammatikis","Panagiotis Sarigiannidis","Georgios Th. Papadopoulos"],"pdf_url":"https://arxiv.org/pdf/2407.04103v2.pdf","comment":"65 pages, 15 figures"},{"id":"http://arxiv.org/abs/2501.03053v2","updated":"2025-01-10T15:35:07Z","published":"2025-01-06T14:40:45Z","title":"Dr. Tongue: Sign-Oriented Multi-label Detection for Remote Tongue\n Diagnosis","summary":" Tongue diagnosis is a vital tool in Western and Traditional Chinese Medicine,\nproviding key insights into a patient's health by analyzing tongue attributes.\nThe COVID-19 pandemic has heightened the need for accurate remote medical\nassessments, emphasizing the importance of precise tongue attribute recognition\nvia telehealth. To address this, we propose a Sign-Oriented multi-label\nAttributes Detection framework. Our approach begins with an adaptive tongue\nfeature extraction module that standardizes tongue images and mitigates\nenvironmental factors. This is followed by a Sign-oriented Network (SignNet)\nthat identifies specific tongue attributes, emulating the diagnostic process of\nexperienced practitioners and enabling comprehensive health evaluations. To\nvalidate our methodology, we developed an extensive tongue image dataset\nspecifically designed for telemedicine. Unlike existing datasets, ours is\ntailored for remote diagnosis, with a comprehensive set of attribute labels.\nThis dataset will be openly available, providing a valuable resource for\nresearch. 
Initial tests have shown improved accuracy in detecting various\ntongue attributes, highlighting our framework's potential as an essential tool\nfor remote medical assessments.\n","authors":["Yiliang Chen","Steven SC Ho","Cheng Xu","Yao Jie Xie","Wing-Fai Yeung","Shengfeng He","Jing Qin"],"pdf_url":"https://arxiv.org/pdf/2501.03053v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06053v1","updated":"2025-01-10T15:33:37Z","published":"2025-01-10T15:33:37Z","title":"Enhancing, Refining, and Fusing: Towards Robust Multi-Scale and Dense\n Ship Detection","summary":" Synthetic aperture radar (SAR) imaging, celebrated for its high resolution,\nall-weather capability, and day-night operability, is indispensable for\nmaritime applications. However, ship detection in SAR imagery faces significant\nchallenges, including complex backgrounds, densely arranged targets, and large\nscale variations. To address these issues, we propose a novel framework,\nCenter-Aware SAR Ship Detector (CASS-Det), designed for robust multi-scale and\ndensely packed ship detection. CASS-Det integrates three key innovations: (1) a\ncenter enhancement module (CEM) that employs rotational convolution to\nemphasize ship centers, improving localization while suppressing background\ninterference; (2) a neighbor attention module (NAM) that leverages cross-layer\ndependencies to refine ship boundaries in densely populated scenes; and (3) a\ncross-connected feature pyramid network (CC-FPN) that enhances multi-scale\nfeature fusion by integrating shallow and deep features. 
Extensive experiments\non the SSDD, HRSID, and LS-SSDD-v1.0 datasets demonstrate the state-of-the-art\nperformance of CASS-Det, excelling at detecting multi-scale and densely\narranged ships.\n","authors":["Congxia Zhao","Xiongjun Fu","Jian Dong","Shen Cao","Chunyan Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.06053v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06040v1","updated":"2025-01-10T15:18:05Z","published":"2025-01-10T15:18:05Z","title":"MSCViT: A Small-size ViT architecture with Multi-Scale Self-Attention\n Mechanism for Tiny Datasets","summary":" Vision Transformer (ViT) has demonstrated significant potential in various\nvision tasks due to its strong ability in modelling long-range dependencies.\nHowever, such success is largely fueled by training on massive samples. In real\napplications, the large-scale datasets are not always available, and ViT\nperforms worse than Convolutional Neural Networks (CNNs) if it is only trained\non a small-scale dataset (called a tiny dataset), since it requires a large amount of\ntraining data to ensure its representational capacity. In this paper, a\nsmall-size ViT architecture with multi-scale self-attention mechanism and\nconvolution blocks is presented (dubbed MSCViT) to model different scales of\nattention at each layer. First, we introduce wavelet convolution, which\nselectively combines the high-frequency components obtained by frequency\ndivision with our convolution channel to extract local features. Then, a\nlightweight multi-head attention module is developed to reduce the number of\ntokens and computational costs. Finally, the positional encoding (PE) in the\nbackbone is replaced by a local feature extraction module. Compared with the\noriginal ViT, it is parameter-efficient and is particularly suitable for tiny\ndatasets. 
Extensive experiments have been conducted on tiny datasets, in which\nour model achieves an accuracy of 84.68% on CIFAR-100 with 14.0M parameters and\n2.5 GFLOPs, without pre-training on large datasets.\n","authors":["Bowei Zhang","Yi Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.06040v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06039v1","updated":"2025-01-10T15:17:27Z","published":"2025-01-10T15:17:27Z","title":"AI-powered virtual tissues from spatial proteomics for clinical\n diagnostics and biomedical discovery","summary":" Spatial proteomics technologies have transformed our understanding of complex\ntissue architectures by enabling simultaneous analysis of multiple molecular\nmarkers and their spatial organization. The high dimensionality of these data,\nvarying marker combinations across experiments and heterogeneous study designs\npose unique challenges for computational analysis. Here, we present Virtual\nTissues (VirTues), a foundation model framework for biological tissues that\noperates across the molecular, cellular and tissue scale. VirTues introduces\ninnovations in transformer architecture design, including a novel tokenization\nscheme that captures both spatial and marker dimensions, and attention\nmechanisms that scale to high-dimensional multiplex data while maintaining\ninterpretability. Trained on diverse cancer and non-cancer tissue datasets,\nVirTues demonstrates strong generalization capabilities without task-specific\nfine-tuning, enabling cross-study analysis and novel marker integration. 
As a\ngeneralist model, VirTues outperforms existing approaches across clinical\ndiagnostics, biological discovery and patient case retrieval tasks, while\nproviding insights into tissue function and disease mechanisms.\n","authors":["Johann Wenckstern","Eeshaan Jain","Kiril Vasilev","Matteo Pariset","Andreas Wicki","Gabriele Gut","Charlotte Bunne"],"pdf_url":"https://arxiv.org/pdf/2501.06039v1.pdf","comment":"23 pages, 5 figures"},{"id":"http://arxiv.org/abs/2501.06038v1","updated":"2025-01-10T15:17:02Z","published":"2025-01-10T15:17:02Z","title":"A Holistically Point-guided Text Framework for Weakly-Supervised\n Camouflaged Object Detection","summary":" Weakly-Supervised Camouflaged Object Detection (WSCOD) has gained popularity\nfor its promise to train models with weak labels to segment objects that\nvisually blend into their surroundings. Recently, some methods using\nsparsely-annotated supervision have shown promising results through scribbling in\nWSCOD, while point-text supervision remains underexplored. Hence, this paper\nintroduces a novel holistically point-guided text framework for WSCOD by\ndecomposing it into three phases: segment, choose, train. Specifically, we propose\nPoint-guided Candidate Generation (PCG), where the point's foreground serves as\na correction for the text path to explicitly correct and rejuvenate the loss\ndetection object during the mask generation process (SEGMENT). We also\nintroduce a Qualified Candidate Discriminator (QCD) to choose the optimal mask\nfrom a given text prompt using CLIP (CHOOSE), and employ the chosen pseudo mask\nfor training with a self-supervised Vision Transformer (TRAIN). Additionally,\nwe developed a new point-supervised dataset (P2C-COD) and a text-supervised\ndataset (T-COD). 
Comprehensive experiments on four benchmark datasets\ndemonstrate our method outperforms state-of-the-art methods by a large margin,\nand also outperforms some existing fully-supervised camouflaged object\ndetection methods.\n","authors":["Tsui Qin Mok","Shuyong Gao","Haozhe Xing","Miaoyang He","Yan Wang","Wenqiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.06038v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06035v1","updated":"2025-01-10T15:13:43Z","published":"2025-01-10T15:13:43Z","title":"Nonisotropic Gaussian Diffusion for Realistic 3D Human Motion Prediction","summary":" Probabilistic human motion prediction aims to forecast multiple possible\nfuture movements from past observations. While current approaches report high\ndiversity and realism, they often generate motions with undetected limb\nstretching and jitter. To address this, we introduce SkeletonDiffusion, a\nlatent diffusion model that embeds an explicit inductive bias on the human body\nwithin its architecture and training. Our model is trained with a novel\nnonisotropic Gaussian diffusion formulation that aligns with the natural\nkinematic structure of the human skeleton. Results show that our approach\noutperforms conventional isotropic alternatives, consistently generating\nrealistic predictions while avoiding artifacts such as limb distortion.\nAdditionally, we identify a limitation in commonly used diversity metrics,\nwhich may inadvertently favor models that produce inconsistent limb lengths\nwithin the same sequence. SkeletonDiffusion sets a new benchmark on three\nreal-world datasets, outperforming various baselines across multiple evaluation\nmetrics. 
Visit our project page:\nhttps://ceveloper.github.io/publications/skeletondiffusion/\n","authors":["Cecilia Curreli","Dominik Muhle","Abhishek Saroha","Zhenzhang Ye","Riccardo Marin","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2501.06035v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06031v1","updated":"2025-01-10T15:07:57Z","published":"2025-01-10T15:07:57Z","title":"Generate, Transduct, Adapt: Iterative Transduction with VLMs","summary":" Transductive zero-shot learning with vision-language models leverages\nimage-image similarities within the dataset to achieve better classification\naccuracy compared to the inductive setting. However, there is little work that\nexplores the structure of the language space in this context. We propose\nGTA-CLIP, a novel technique that incorporates supervision from language models\nfor joint transduction in language and vision spaces. Our approach is iterative\nand consists of three steps: (i) incrementally exploring the attribute space by\nquerying language models, (ii) an attribute-augmented transductive inference\nprocedure, and (iii) fine-tuning the language and vision encoders based on\ninferred labels within the dataset. Through experiments with CLIP encoders, we\ndemonstrate that GTA-CLIP yields an average performance improvement of 8.6%\nand 3.7% across 12 datasets and 3 encoders, over CLIP and transductive CLIP\nrespectively in the zero-shot setting. We also observe similar improvements in\na few-shot setting. 
We present ablation studies that demonstrate the value of\neach step and visualize how the vision and language spaces evolve over\niterations driven by the transductive learning.\n","authors":["Oindrila Saha","Logan Lawrence","Grant Van Horn","Subhransu Maji"],"pdf_url":"https://arxiv.org/pdf/2501.06031v1.pdf","comment":"Code will be released at https://github.com/cvl-umass/GTA-CLIP"},{"id":"http://arxiv.org/abs/2501.06027v1","updated":"2025-01-10T15:04:23Z","published":"2025-01-10T15:04:23Z","title":"Geometric-Based Nail Segmentation for Clinical Measurements","summary":" A robust segmentation method that can be used to perform measurements on\ntoenails is presented. The proposed method is used as the first step in a\nclinical trial to objectively quantify the incidence of a particular pathology.\nFor such an assessment, it is necessary to distinguish a nail, which locally\nappears to be similar to the skin. Many algorithms have been used, each of\nwhich leverages different aspects of toenail appearance. We used the Hough\ntransform to locate the tip of the toe and estimate the nail location and size.\nSubsequently, we classified the super-pixels of the image based on their\ngeometric and photometric information. Thereafter, the watershed transform\ndelineated the border of the nail. The method was validated using a 348-image\nmedical dataset, achieving an accuracy of 0.993 and an F-measure of 0.925. 
The\nproposed method is considerably robust across samples, with respect to factors\nsuch as nail shape, skin pigmentation, illumination conditions, and appearance\nof large regions affected by a medical condition.\n","authors":["Bernat Galmés","Gabriel Moyà-Alcover","Pedro Bibiloni","Javier Varona","Antoni Jaume-i-Capó"],"pdf_url":"https://arxiv.org/pdf/2501.06027v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04794v2","updated":"2025-01-10T14:59:31Z","published":"2025-01-08T19:18:44Z","title":"A Steerable Deep Network for Model-Free Diffusion MRI Registration","summary":" Nonrigid registration is vital to medical image analysis but remains\nchallenging for diffusion MRI (dMRI) due to its high-dimensional,\norientation-dependent nature. While classical methods are accurate, they are\ncomputationally demanding, and deep neural networks, though efficient, have\nbeen underexplored for nonrigid dMRI registration compared to structural\nimaging. We present a novel, deep learning framework for model-free, nonrigid\nregistration of raw diffusion MRI data that does not require explicit\nreorientation. Unlike previous methods relying on derived representations such\nas diffusion tensors or fiber orientation distribution functions, in our\napproach, we formulate the registration as an equivariant diffeomorphism of\nposition-and-orientation space. Central to our method is an\n$\mathsf{SE}(3)$-equivariant UNet that generates velocity fields while\npreserving the geometric properties of a raw dMRI's domain. We introduce a new\nloss function based on the maximum mean discrepancy in Fourier space,\nimplicitly matching ensemble average propagators across images. Experimental\nresults on Human Connectome Project dMRI data demonstrate competitive\nperformance compared to state-of-the-art approaches, with the added advantage\nof bypassing the overhead for estimating derived representations. 
This work\nestablishes a foundation for data-driven, geometry-aware dMRI registration\ndirectly in the acquisition space.\n","authors":["Gianfranco Cortes","Xiaoda Qu","Baba C. Vemuri"],"pdf_url":"https://arxiv.org/pdf/2501.04794v2.pdf","comment":"Coauthor was inadvertently left out. This is now corrected"},{"id":"http://arxiv.org/abs/2501.06019v1","updated":"2025-01-10T14:57:18Z","published":"2025-01-10T14:57:18Z","title":"BRIGHT: A globally distributed multimodal building damage assessment\n dataset with very-high-resolution for all-weather disaster response","summary":" Disaster events occur around the world and cause significant damage to human\nlife and property. Earth observation (EO) data enables rapid and comprehensive\nbuilding damage assessment (BDA), an essential capability in the aftermath of a\ndisaster to reduce human casualties and to inform disaster relief efforts.\nRecent research focuses on the development of AI models to achieve accurate\nmapping of unseen disaster events, mostly using optical EO data. However,\nsolutions based on optical data are limited to clear skies and daylight hours,\npreventing a prompt response to disasters. Integrating multimodal (MM) EO data,\nparticularly the combination of optical and SAR imagery, makes it possible to\nprovide all-weather, day-and-night disaster responses. Despite this potential,\nthe development of robust multimodal AI models has been constrained by the lack\nof suitable benchmark datasets. In this paper, we present a BDA dataset using\nveRy-hIGH-resoluTion optical and SAR imagery (BRIGHT) to support AI-based\nall-weather disaster response. To the best of our knowledge, BRIGHT is the\nfirst open-access, globally distributed, event-diverse MM dataset specifically\ncurated to support AI-based disaster response. 
It covers five types of natural\ndisasters and two types of man-made disasters across 12 regions worldwide, with\na particular focus on developing countries where external assistance is most\nneeded. The optical and SAR imagery in BRIGHT, with a spatial resolution\nbetween 0.3-1 meters, provides detailed representations of individual\nbuildings, making it ideal for precise BDA. In our experiments, we have tested\nseven advanced AI models trained with our BRIGHT to validate the\ntransferability and robustness. The dataset and code are available at\nhttps://github.com/ChenHongruixuan/BRIGHT. BRIGHT also serves as the official\ndataset for the 2025 IEEE GRSS Data Fusion Contest.\n","authors":["Hongruixuan Chen","Jian Song","Olivier Dietrich","Clifford Broni-Bediako","Weihao Xuan","Junjue Wang","Xinlei Shao","Yimin Wei","Junshi Xia","Cuiling Lan","Konrad Schindler","Naoto Yokoya"],"pdf_url":"https://arxiv.org/pdf/2501.06019v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06014v1","updated":"2025-01-10T14:50:00Z","published":"2025-01-10T14:50:00Z","title":"Pose-independent 3D Anthropometry from Sparse Data","summary":" 3D digital anthropometry is the study of estimating human body measurements\nfrom 3D scans. Precise body measurements are important health indicators in the\nmedical industry, and guiding factors in the fashion, ergonomic and\nentertainment industries. The measuring protocol consists of scanning the whole\nsubject in the static A-pose, which is maintained without breathing or movement\nduring the scanning process. However, the A-pose is not easy to maintain during\nthe whole scanning process, which can last even up to a couple of minutes. This\nconstraint affects the final quality of the scan, which in turn affects the\naccuracy of the estimated body measurements obtained from methods that rely on\ndense geometric data. 
Additionally, this constraint makes it impossible to\ndevelop a digital anthropometry method for subjects unable to assume the\nA-pose, such as those with injuries or disabilities. We propose a method that\ncan obtain body measurements from sparse landmarks acquired in any pose. We\nmake use of the sparse landmarks of the posed subject to create\npose-independent features, and train a network to predict the body measurements\nas taken from the standard A-pose. We show that our method achieves comparable\nresults to competing methods that use dense geometry in the standard A-pose,\nbut has the capability of estimating the body measurements from any pose using\nsparse landmarks only. Finally, we address the lack of open-source 3D\nanthropometry methods by making our method available to the research community\nat https://github.com/DavidBoja/pose-independent-anthropometry.\n","authors":["David Bojanić","Stefanie Wuhrer","Tomislav Petković","Tomislav Pribanić"],"pdf_url":"https://arxiv.org/pdf/2501.06014v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.16745v2","updated":"2025-01-10T14:40:49Z","published":"2024-12-21T19:41:10Z","title":"ViM-Disparity: Bridging the Gap of Speed, Accuracy and Memory for\n Disparity Map Generation","summary":" In this work we propose a Visual Mamba (ViM) based architecture, to dissolve\nthe existing trade-off for real-time and accurate model with low computation\noverhead for disparity map generation (DMG). Moreover, we proposed a\nperformance measure that can jointly evaluate the inference speed, computation\noverhead and the accurateness of a DMG model. 
The code implementation and\ncorresponding models are available at: https://github.com/MBora/ViM-Disparity.\n","authors":["Maheswar Bora","Tushar Anand","Saurabh Atreya","Aritra Mukherjee","Abhijit Das"],"pdf_url":"https://arxiv.org/pdf/2412.16745v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06006v1","updated":"2025-01-10T14:37:32Z","published":"2025-01-10T14:37:32Z","title":"CamCtrl3D: Single-Image Scene Exploration with Precise 3D Camera Control","summary":" We propose a method for generating fly-through videos of a scene, from a\nsingle image and a given camera trajectory. We build upon an image-to-video\nlatent diffusion model. We condition its UNet denoiser on the camera\ntrajectory, using four techniques. (1) We condition the UNet's temporal blocks\non raw camera extrinsics, similar to MotionCtrl. (2) We use images containing\ncamera rays and directions, similar to CameraCtrl. (3) We reproject the initial\nimage to subsequent frames and use the resulting video as a condition. (4) We\nuse 2D<=>3D transformers to introduce a global 3D representation, which\nimplicitly conditions on the camera poses. We combine all conditions in a\nControlNet-style architecture. We then propose a metric that evaluates overall\nvideo quality and the ability to preserve details with view changes, which we\nuse to analyze the trade-offs of individual and combined conditions. Finally,\nwe identify an optimal combination of conditions. We calibrate camera positions\nin our datasets for scale consistency across scenes, and we train our scene\nexploration model, CamCtrl3D, demonstrating state-of-the-art results.\n","authors":["Stefan Popov","Amit Raj","Michael Krainin","Yuanzhen Li","William T. 
Freeman","Michael Rubinstein"],"pdf_url":"https://arxiv.org/pdf/2501.06006v1.pdf","comment":"To be published in 3DV 2025"},{"id":"http://arxiv.org/abs/2501.06004v1","updated":"2025-01-10T14:35:16Z","published":"2025-01-10T14:35:16Z","title":"SeMi: When Imbalanced Semi-Supervised Learning Meets Mining Hard\n Examples","summary":" Semi-Supervised Learning (SSL) can leverage abundant unlabeled data to boost\nmodel performance. However, the class-imbalanced data distribution in\nreal-world scenarios poses great challenges to SSL, resulting in performance\ndegradation. Existing class-imbalanced semi-supervised learning (CISSL) methods\nmainly focus on rebalancing datasets but ignore the potential of using hard\nexamples to enhance performance, making it difficult to fully harness the power\nof unlabeled data even with sophisticated algorithms. To address this issue, we\npropose a method that enhances the performance of Imbalanced Semi-Supervised\nLearning by Mining Hard Examples (SeMi). This method distinguishes the entropy\ndifferences among logits of hard and easy examples, thereby identifying hard\nexamples and increasing the utility of unlabeled data, better addressing the\nimbalance problem in CISSL. In addition, we maintain a class-balanced memory\nbank with confidence decay for storing high-confidence embeddings to enhance\nthe pseudo-labels' reliability. Although our method is simple, it is effective\nand seamlessly integrates with existing approaches. 
We perform comprehensive\nexperiments on standard CISSL benchmarks and experimentally demonstrate that\nour proposed SeMi outperforms existing state-of-the-art methods on multiple\nbenchmarks, especially in reversed scenarios, where our best result shows\napproximately a 54.8\\% improvement over the baseline methods.\n","authors":["Yin Wang","Zixuan Wang","Hao Lu","Zhen Qin","Hailiang Zhao","Guanjie Cheng","Ge Su","Li Kuang","Mengchu Zhou","Shuiguang Deng"],"pdf_url":"https://arxiv.org/pdf/2501.06004v1.pdf","comment":"11 pages,6 figures, conference"},{"id":"http://arxiv.org/abs/2501.06000v1","updated":"2025-01-10T14:32:20Z","published":"2025-01-10T14:32:20Z","title":"Self-Supervised Partial Cycle-Consistency for Multi-View Matching","summary":" Matching objects across partially overlapping camera views is crucial in\nmulti-camera systems and requires a view-invariant feature extraction network.\nTraining such a network with cycle-consistency circumvents the need for\nlabor-intensive labeling. In this paper, we extend the mathematical formulation\nof cycle-consistency to handle partial overlap. We then introduce a pseudo-mask\nwhich directs the training loss to take partial overlap into account. We\nadditionally present several new cycle variants that complement each other and\npresent a time-divergent scene sampling scheme that improves the data input for\nthis self-supervised setting. Cross-camera matching experiments on the\nchallenging DIVOTrack dataset show the merits of our approach. Compared to the\nself-supervised state-of-the-art, we achieve a 4.3 percentage point higher F1\nscore with our combined contributions. Our improvements are robust to reduced\noverlap in the training data, with substantial improvements in challenging\nscenes that need to make few matches between many people. 
Self-supervised\nfeature networks trained with our method are effective at matching objects in a\nrange of multi-camera settings, providing opportunities for complex tasks like\nlarge-scale multi-camera scene understanding.\n","authors":["Fedor Taggenbrock","Gertjan Burghouts","Ronald Poppe"],"pdf_url":"https://arxiv.org/pdf/2501.06000v1.pdf","comment":"Accepted to VISAPP 2025"},{"id":"http://arxiv.org/abs/2501.05997v1","updated":"2025-01-10T14:29:03Z","published":"2025-01-10T14:29:03Z","title":"Minimizing Occlusion Effect on Multi-View Camera Perception in BEV with\n Multi-Sensor Fusion","summary":" Autonomous driving technology is rapidly evolving, offering the potential for\nsafer and more efficient transportation. However, the performance of these\nsystems can be significantly compromised by the occlusion on sensors due to\nenvironmental factors like dirt, dust, rain, and fog. These occlusions severely\naffect vision-based tasks such as object detection, vehicle segmentation, and\nlane recognition. In this paper, we investigate the impact of various kinds of\nocclusions on camera sensor by projecting their effects from multi-view camera\nimages of the nuScenes dataset into the Bird's-Eye View (BEV) domain. This\napproach allows us to analyze how occlusions spatially distribute and influence\nvehicle segmentation accuracy within the BEV domain. Despite significant\nadvances in sensor technology and multi-sensor fusion, a gap remains in the\nexisting literature regarding the specific effects of camera occlusions on\nBEV-based perception systems. To address this gap, we use a multi-sensor fusion\ntechnique that integrates LiDAR and radar sensor data to mitigate the\nperformance degradation caused by occluded cameras. 
Our findings demonstrate\nthat this approach significantly enhances the accuracy and robustness of\nvehicle segmentation tasks, leading to more reliable autonomous driving\nsystems.\n","authors":["Sanjay Kumar","Hiep Truong","Sushil Sharma","Ganesh Sistu","Tony Scanlan","Eoin Grua","Ciarán Eising"],"pdf_url":"https://arxiv.org/pdf/2501.05997v1.pdf","comment":"Accepted form publishing at the Electronic Imaging - Autonomous\n Vehicles and Machines Conference"},{"id":"http://arxiv.org/abs/2501.05991v1","updated":"2025-01-10T14:25:01Z","published":"2025-01-10T14:25:01Z","title":"An Attention-Guided Deep Learning Approach for Classifying 39 Skin\n Lesion Types","summary":" The skin, as the largest organ of the human body, is vulnerable to a diverse\narray of conditions collectively known as skin lesions, which encompass various\ndermatoses. Diagnosing these lesions presents significant challenges for\nmedical practitioners due to the subtle visual differences that are often\nimperceptible to the naked eye. While not all skin lesions are\nlife-threatening, certain types can act as early indicators of severe diseases,\nincluding skin cancers, underscoring the critical need for timely and accurate\ndiagnostic methods. Deep learning algorithms have demonstrated remarkable\npotential in facilitating the early detection and prognosis of skin lesions.\nThis study advances the field by curating a comprehensive and diverse dataset\ncomprising 39 categories of skin lesions, synthesized from five publicly\navailable datasets. Using this dataset, the performance of five\nstate-of-the-art deep learning models -- MobileNetV2, Xception, InceptionV3,\nEfficientNetB1, and Vision Transformer - is rigorously evaluated. To enhance\nthe accuracy and robustness of these models, attention mechanisms such as the\nEfficient Channel Attention (ECA) and the Convolutional Block Attention Module\n(CBAM) are incorporated into their architectures. 
Comprehensive evaluation\nacross multiple performance metrics reveals that the Vision Transformer model\nintegrated with CBAM outperforms others, achieving an accuracy of 93.46%,\nprecision of 94%, recall of 93%, F1-score of 93%, and specificity of 93.67%.\nThese results underscore the significant potential of the proposed system in\nsupporting medical professionals with accurate and efficient prognostic tools\nfor diagnosing a broad spectrum of skin lesions. The dataset and code used in\nthis study can be found at\nhttps://github.com/akabircs/Skin-Lesions-Classification.\n","authors":["Sauda Adiv Hanum","Ashim Dey","Muhammad Ashad Kabir"],"pdf_url":"https://arxiv.org/pdf/2501.05991v1.pdf","comment":"26 pages"},{"id":"http://arxiv.org/abs/2302.10798v5","updated":"2025-01-10T13:43:48Z","published":"2023-02-17T09:37:17Z","title":"Learning a Consensus Sub-Network with Polarization Regularization and\n One Pass Training","summary":" The subject of green AI has been gaining attention within the deep learning\ncommunity given the recent trend of ever larger and more complex neural network\nmodels. Existing solutions for reducing the computational load of training at\ninference time usually involve pruning the network parameters. Pruning schemes\noften create extra overhead either by iterative training and fine-tuning for\nstatic pruning or repeated computation of a dynamic pruning graph. We propose a\nnew parameter pruning strategy for learning a lighter-weight sub-network that\nminimizes the energy cost while maintaining comparable performance to the fully\nparameterised network on given downstream tasks. Our proposed pruning scheme is\ngreen-oriented, as it only requires a one-off training to discover the optimal\nstatic sub-networks by dynamic pruning methods. The pruning scheme consists of\na binary gating module and a polarizing loss function to uncover sub-networks\nwith user-defined sparsity. 
Our method enables pruning and training\nsimultaneously, which saves energy in both the training and inference phases\nand avoids extra computational overhead from gating modules at inference time.\nOur results on CIFAR-10, CIFAR-100, and Tiny Imagenet suggest that our scheme\ncan remove 50% of connections in deep networks with <1% reduction in\nclassification accuracy. Compared to other related pruning methods, our method\ndemonstrates a lower drop in accuracy for equivalent reductions in\ncomputational cost.\n","authors":["Xiaoying Zhi","Varun Babbar","Rundong Liu","Pheobe Sun","Fran Silavong","Ruibo Shi","Sean Moran"],"pdf_url":"https://arxiv.org/pdf/2302.10798v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05961v1","updated":"2025-01-10T13:41:10Z","published":"2025-01-10T13:41:10Z","title":"Swin-X2S: Reconstructing 3D Shape from 2D Biplanar X-ray with Swin\n Transformers","summary":" The conversion from 2D X-ray to 3D shape holds significant potential for\nimproving diagnostic efficiency and safety. However, existing reconstruction\nmethods often rely on hand-crafted features, manual intervention, and prior\nknowledge, resulting in unstable shape errors and additional processing costs.\nIn this paper, we introduce Swin-X2S, an end-to-end deep learning method for\ndirectly reconstructing 3D segmentation and labeling from 2D biplanar\northogonal X-ray images. Swin-X2S employs an encoder-decoder architecture: the\nencoder leverages 2D Swin Transformer for X-ray information extraction, while\nthe decoder employs 3D convolution with cross-attention to integrate structural\nfeatures from orthogonal views. A dimension-expanding module is introduced to\nbridge the encoder and decoder, ensuring a smooth conversion from 2D pixels to\n3D voxels. 
We evaluate the proposed method through extensive qualitative and\nquantitative experiments across nine publicly available datasets covering four\nanatomies (femur, hip, spine, and rib), with a total of 54 categories.\nSignificant improvements over previous methods have been observed not only in\nthe segmentation and labeling metrics but also in the clinically relevant\nparameters that are of primary concern in practical applications, which\ndemonstrates the promise of Swin-X2S to provide an effective option for\nanatomical shape reconstruction in clinical scenarios. Code implementation is\navailable at: \\url{https://github.com/liukuan5625/Swin-X2S}.\n","authors":["Kuan Liu","Zongyuan Ying","Jie Jin","Dongyan Li","Ping Huang","Wenjian Wu","Zhe Chen","Jin Qi","Yong Lu","Lianfu Deng","Bo Chen"],"pdf_url":"https://arxiv.org/pdf/2501.05961v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05952v1","updated":"2025-01-10T13:27:04Z","published":"2025-01-10T13:27:04Z","title":"Scalable Vision Language Model Training via High Quality Data Curation","summary":" In this paper, we introduce SAIL-VL (ScAlable Vision Language Model TraIning\nvia High QuaLity Data Curation), an open-source vision language model (VLM) of\nstate-of-the-art (SOTA) performance with 2B parameters. We introduce three key\nimprovements that contribute to SAIL-VL's leading performance: (1) Scalable\nhigh-quality visual understanding data construction: We implement a visual\nunderstanding data construction pipeline, which enables hundred-million-scale\nhigh-quality recaption data annotation. Equipped with this pipeline, we curate\nSAIL-Caption, a large-scale caption dataset with large quantity and the highest\ndata quality compared with open-source caption datasets. 
(2) Scalable\nPretraining with High-Quality Visual Understanding Data: We scale SAIL-VL's\npretraining budget up to 131B tokens and show that even a 2B VLM benefits from\nscaled up training data sizes, exhibiting expected data size scaling laws in\nvisual understanding and instruction following performance. (3) Scalable SFT\nvia quantity and quality scaling: We introduce general guidance for instruction\ndata curation to scale up instruction data continuously, allowing us to\nconstruct a large SFT dataset with the highest quality. To further improve\nSAIL-VL's performance, we propose quality scaling, a multi-stage training\nrecipe with curriculum learning, to improve model performance scaling curves\nw.r.t. data sizes from logarithmic to be near-linear. SAIL-VL obtains the\nhighest average score in 19 commonly used benchmarks in our evaluation and\nachieves top1 performance among VLMs of comparable sizes on OpenCompass\n(https://rank.opencompass.org.cn/leaderboard-multimodal). We release our\nSAIL-VL-2B model at HuggingFace\n(https://huggingface.co/BytedanceDouyinContent/SAIL-VL-2B).\n","authors":["Hongyuan Dong","Zijian Kang","Weijie Yin","Xiao Liang","Chao Feng","Jiao Ran"],"pdf_url":"https://arxiv.org/pdf/2501.05952v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03775v3","updated":"2025-01-10T13:25:32Z","published":"2025-01-07T13:30:54Z","title":"Strip R-CNN: Large Strip Convolution for Remote Sensing Object Detection","summary":" While witnessed with rapid development, remote sensing object detection\nremains challenging for detecting high aspect ratio objects. This paper shows\nthat large strip convolutions are good feature representation learners for\nremote sensing object detection and can detect objects of various aspect ratios\nwell. Based on large strip convolutions, we build a new network architecture\ncalled Strip R-CNN, which is simple, efficient, and powerful. 
Unlike recent\nremote sensing object detectors that leverage large-kernel convolutions with\nsquare shapes, our Strip R-CNN takes advantage of sequential orthogonal large\nstrip convolutions to capture spatial information. In addition, we enhance the\nlocalization capability of remote-sensing object detectors by decoupling the\ndetection heads and equipping the localization head with strip convolutions to\nbetter localize the target objects. Extensive experiments on several\nbenchmarks, e.g., DOTA, FAIR1M, HRSC2016, and DIOR, show that our Strip R-CNN\ncan largely improve previous works. Notably, our 30M model achieves 82.75% mAP\non DOTA-v1.0, setting a new state-of-the-art record. Code is available at\nhttps://github.com/YXB-NKU/Strip-R-CNN.\n","authors":["Xinbin Yuan","Zhaohui Zheng","Yuxuan Li","Xialei Liu","Li Liu","Xiang Li","Qibin Hou","Ming-Ming Cheng"],"pdf_url":"https://arxiv.org/pdf/2501.03775v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05945v1","updated":"2025-01-10T13:15:37Z","published":"2025-01-10T13:15:37Z","title":"Reusable specimen-level inference in computational pathology","summary":" Foundation models for computational pathology have shown great promise for\nspecimen-level tasks and are increasingly accessible to researchers. However,\nspecimen-level models built on these foundation models remain largely\nunavailable, hindering their broader utility and impact. To address this gap,\nwe developed SpinPath, a toolkit designed to democratize specimen-level deep\nlearning by providing a zoo of pretrained specimen-level models, a Python-based\ninference engine, and a JavaScript-based inference platform. We demonstrate the\nutility of SpinPath in metastasis detection tasks across nine foundation\nmodels. SpinPath may foster reproducibility, simplify experimentation, and\naccelerate the adoption of specimen-level deep learning in computational\npathology research.\n","authors":["Jakub R. Kaczmarzyk","Rishul Sharma","Peter K. Koo","Joel H. 
Saltz"],"pdf_url":"https://arxiv.org/pdf/2501.05945v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03916v2","updated":"2025-01-10T13:14:28Z","published":"2025-01-07T16:31:10Z","title":"Dolphin: Closed-loop Open-ended Auto-research through Thinking,\n Practice, and Feedback","summary":" The scientific research paradigm is undergoing a profound transformation\nowing to the development of Artificial Intelligence (AI). Recent works\ndemonstrate that various AI-assisted research methods can largely improve\nresearch efficiency by improving data analysis, accelerating computation, and\nfostering novel idea generation. To further move towards the ultimate goal\n(i.e., automatic scientific research), in this paper, we propose Dolphin, the\nfirst closed-loop open-ended auto-research framework to further build the\nentire process of human scientific research. Dolphin can generate research\nideas, perform experiments, and get feedback from experimental results to\ngenerate higher-quality ideas. More specifically, Dolphin first generates novel\nideas based on relevant papers which are ranked by the topic and task\nattributes. Then, the codes are automatically generated and debugged with the\nexception-traceback-guided local code structure. Finally, Dolphin automatically\nanalyzes the results of each idea and feeds the results back to the next round\nof idea generation. Experiments are conducted on the benchmark datasets of\ndifferent topics and results show that Dolphin can generate novel ideas\ncontinuously and complete the experiment in a loop. 
We highlight that Dolphin\ncan automatically propose methods that are comparable to the state-of-the-art\nin some tasks such as 2D image classification and 3D point classification.\n","authors":["Jiakang Yuan","Xiangchao Yan","Botian Shi","Tao Chen","Wanli Ouyang","Bo Zhang","Lei Bai","Yu Qiao","Bowen Zhou"],"pdf_url":"https://arxiv.org/pdf/2501.03916v2.pdf","comment":"19 pages, 11 figures, and our homepage:\n https://alpha-innovator.github.io/Dolphin-project-page"},{"id":"http://arxiv.org/abs/2412.01246v2","updated":"2025-01-10T13:02:07Z","published":"2024-12-02T08:06:14Z","title":"Class Distance Weighted Cross Entropy Loss for Classification of Disease\n Severity","summary":" Assessing disease severity involving ordinal classes, where each class\nrepresents increasing levels of severity, benefit from loss functions that\naccount for this ordinal structure. Traditional categorical loss functions,\nlike Cross-Entropy (CE), often perform suboptimally in these scenarios. To\naddress this, we propose a novel loss function, Class Distance Weighted\nCross-Entropy (CDW-CE), which penalizes misclassifications more harshly when\nclasses are farther apart. We evaluated CDW-CE on the Labeled Images for\nUlcerative Colitis (LIMUC) dataset using various deep architectures. Its\nperformance was compared against several categorical and ordinal loss\nfunctions. To analyze the quality of latent representations, we used\nt-distributed stochastic neighbor embedding (t-SNE) visualizations and\nquantified their clustering with the Silhouette Score. We also compared Class\nActivation Maps (CAM) generated by models trained with CDW-CE and CE loss,\nincorporating domain expert feedback to evaluate alignment with expert\nknowledge. Our results show that CDW-CE consistently improves performance in\nordinal image classification tasks. 
It achieves higher Silhouette Scores,\nindicating better differentiation of class representations, and its CAM\nvisualizations demonstrate a stronger focus on clinically significant regions,\nas confirmed by domain experts.\n","authors":["Gorkem Polat","Ümit Mert Çağlar","Alptekin Temizel"],"pdf_url":"https://arxiv.org/pdf/2412.01246v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05936v1","updated":"2025-01-10T12:57:33Z","published":"2025-01-10T12:57:33Z","title":"A Multimodal Dataset for Enhancing Industrial Task Monitoring and\n Engagement Prediction","summary":" Detecting and interpreting operator actions, engagement, and object\ninteractions in dynamic industrial workflows remains a significant challenge in\nhuman-robot collaboration research, especially within complex, real-world\nenvironments. Traditional unimodal methods often fall short of capturing the\nintricacies of these unstructured industrial settings. To address this gap, we\npresent a novel Multimodal Industrial Activity Monitoring (MIAM) dataset that\ncaptures realistic assembly and disassembly tasks, facilitating the evaluation\nof key meta-tasks such as action localization, object interaction, and\nengagement prediction. The dataset comprises multi-view RGB, depth, and\nInertial Measurement Unit (IMU) data collected from 22 sessions, amounting to\n290 minutes of untrimmed video, annotated in detail for task performance and\noperator behavior. Its distinctiveness lies in the integration of multiple data\nmodalities and its emphasis on real-world, untrimmed industrial workflows-key\nfor advancing research in human-robot collaboration and operator monitoring.\nAdditionally, we propose a multimodal network that fuses RGB frames, IMU data,\nand skeleton sequences to predict engagement levels during industrial tasks.\nOur approach improves the accuracy of recognizing engagement states, providing\na robust solution for monitoring operator performance in dynamic industrial\nenvironments. 
The dataset and code can be accessed from\nhttps://github.com/navalkishoremehta95/MIAM/.\n","authors":["Naval Kishore Mehta"," Arvind","Himanshu Kumar","Abeer Banerjee","Sumeet Saurav","Sanjay Singh"],"pdf_url":"https://arxiv.org/pdf/2501.05936v1.pdf","comment":"Accepted at the 20th International Conference on Human-Robot\n Interaction (HRI) 2025"},{"id":"http://arxiv.org/abs/2409.16111v2","updated":"2025-01-10T12:56:47Z","published":"2024-09-24T14:19:47Z","title":"CloudTrack: Scalable UAV Tracking with Cloud Semantics","summary":" Nowadays, unmanned aerial vehicles (UAVs) are commonly used in search and\nrescue scenarios to gather information in the search area. The automatic\nidentification of the person searched for in aerial footage could increase the\nautonomy of such systems, reduce the search time, and thus increase the missed\nperson's chances of survival. In this paper, we present a novel approach to\nperform semantically conditioned open vocabulary object tracking that is\nspecifically designed to cope with the limitations of UAV hardware. Our\napproach has several advantages. It can run with verbal descriptions of the\nmissing person, e.g., the color of the shirt, it does not require dedicated\ntraining to execute the mission and can efficiently track a potentially moving\nperson. 
Our experimental results demonstrate the versatility and efficacy of\nour approach.\n","authors":["Yannik Blei","Michael Krawez","Nisarga Nilavadi","Tanja Katharina Kaiser","Wolfram Burgard"],"pdf_url":"https://arxiv.org/pdf/2409.16111v2.pdf","comment":"7 pages, 3 figures"},{"id":"http://arxiv.org/abs/2501.05933v1","updated":"2025-01-10T12:56:18Z","published":"2025-01-10T12:56:18Z","title":"Weakly Supervised Segmentation of Hyper-Reflective Foci with Compact\n Convolutional Transformers and SAM2","summary":" Weakly supervised segmentation has the potential to greatly reduce the\nannotation effort for training segmentation models for small structures such as\nhyper-reflective foci (HRF) in optical coherence tomography (OCT). However,\nmost weakly supervised methods either involve a strong downsampling of input\nimages, or only achieve localization at a coarse resolution, both of which are\nunsatisfactory for small structures. We propose a novel framework that\nincreases the spatial resolution of a traditional attention-based Multiple\nInstance Learning (MIL) approach by using Layer-wise Relevance Propagation\n(LRP) to prompt the Segment Anything Model (SAM~2), and increases recall with\niterative inference. Moreover, we demonstrate that replacing MIL with a Compact\nConvolutional Transformer (CCT), which adds a positional encoding, and permits\nan exchange of information between different regions of the OCT image, leads to\na further and substantial increase in segmentation accuracy.\n","authors":["Olivier Morelle","Justus Bisten","Maximilian W. M. Wintergerst","Robert P. 
Finger","Thomas Schultz"],"pdf_url":"https://arxiv.org/pdf/2501.05933v1.pdf","comment":"7 pages, 1 figure, accepted at German Conference on Medical Image\n Computing 2025"},{"id":"http://arxiv.org/abs/2410.07128v2","updated":"2025-01-10T12:34:47Z","published":"2024-09-23T11:29:19Z","title":"Neural Differential Appearance Equations","summary":" We propose a method to reproduce dynamic appearance textures with\nspace-stationary but time-varying visual statistics. While most previous work\ndecomposes dynamic textures into static appearance and motion, we focus on\ndynamic appearance that results not from motion but variations of fundamental\nproperties, such as rusting, decaying, melting, and weathering. To this end, we\nadopt the neural ordinary differential equation (ODE) to learn the underlying\ndynamics of appearance from a target exemplar. We simulate the ODE in two\nphases. At the \"warm-up\" phase, the ODE diffuses a random noise to an initial\nstate. We then constrain the further evolution of this ODE to replicate the\nevolution of visual feature statistics in the exemplar during the generation\nphase. The particular innovation of this work is the neural ODE achieving both\ndenoising and evolution for dynamics synthesis, with a proposed temporal\ntraining scheme. We study both relightable (BRDF) and non-relightable (RGB)\nappearance models. For both we introduce new pilot datasets, allowing, for the\nfirst time, to study such phenomena: For RGB we provide 22 dynamic textures\nacquired from free online sources; For BRDFs, we further acquire a dataset of\n21 flash-lit videos of time-varying materials, enabled by a simple-to-construct\nsetup. Our experiments show that our method consistently yields realistic and\ncoherent results, whereas prior works falter under pronounced temporal\nappearance variations. 
A user study confirms our approach is preferred to\nprevious work for such exemplars.\n","authors":["Chen Liu","Tobias Ritschel"],"pdf_url":"https://arxiv.org/pdf/2410.07128v2.pdf","comment":"SIGGRAPH Asia 2024 Journal Track. Project page at\n https://ryushinn.github.io/ode-appearance"},{"id":"http://arxiv.org/abs/2412.05983v2","updated":"2025-01-10T12:28:30Z","published":"2024-12-08T16:10:42Z","title":"Chimera: Improving Generalist Model with Domain-Specific Experts","summary":" Recent advancements in Large Multi-modal Models (LMMs) underscore the\nimportance of scaling by increasing image-text paired data, achieving\nimpressive performance on general tasks. Despite their effectiveness in broad\napplications, generalist models are primarily trained on web-scale datasets\ndominated by natural images, resulting in the sacrifice of specialized\ncapabilities for domain-specific tasks that require extensive domain prior\nknowledge. Moreover, directly integrating expert models tailored for specific\ndomains is challenging due to the representational gap and imbalanced\noptimization between the generalist model and experts. To address these\nchallenges, we introduce Chimera, a scalable and low-cost multi-modal pipeline\ndesigned to boost the ability of existing LMMs with domain-specific experts.\nSpecifically, we design a progressive training strategy to integrate features\nfrom expert models into the input of a generalist LMM. 
To address the\nimbalanced optimization caused by the well-aligned general visual encoder, we\nintroduce a novel Generalist-Specialist Collaboration Masking (GSCM) mechanism.\nThis results in a versatile model that excels across the chart, table, math,\nand document domains, achieving state-of-the-art performance on multi-modal\nreasoning and visual content extraction tasks, both of which are challenging\ntasks for assessing existing LMMs.\n","authors":["Tianshuo Peng","Mingsheng Li","Hongbin Zhou","Renqiu Xia","Renrui Zhang","Lei Bai","Song Mao","Bin Wang","Conghui He","Aojun Zhou","Botian Shi","Tao Chen","Bo Zhang","Xiangyu Yue"],"pdf_url":"https://arxiv.org/pdf/2412.05983v2.pdf","comment":"Chimera Homepage: https://alpha-innovator.github.io/chimera_page"},{"id":"http://arxiv.org/abs/2412.11863v2","updated":"2025-01-10T12:22:53Z","published":"2024-12-16T15:20:03Z","title":"GeoX: Geometric Problem Solving Through Unified Formalized\n Vision-Language Pre-training","summary":" Despite their proficiency in general tasks, Multi-modal Large Language Models\n(MLLMs) struggle with automatic Geometry Problem Solving (GPS), which demands\nunderstanding diagrams, interpreting symbols, and performing complex reasoning.\nThis limitation arises from their pre-training on natural images and texts,\nalong with the lack of automated verification in the problem-solving process.\nBesides, current geometric specialists are limited by their task-specific\ndesigns, making them less effective for broader geometric problems. To this\nend, we present GeoX, a multi-modal large model focusing on geometric\nunderstanding and reasoning tasks. Given the significant differences between\ngeometric diagram-symbol and natural image-text, we introduce unimodal\npre-training to develop a diagram encoder and symbol decoder, enhancing the\nunderstanding of geometric images and corpora. 
Furthermore, we introduce\ngeometry-language alignment, an effective pre-training paradigm that bridges\nthe modality gap between unimodal geometric experts. We propose a\nGenerator-And-Sampler Transformer (GS-Former) to generate discriminative\nqueries and eliminate uninformative representations from unevenly distributed\ngeometric signals. Finally, GeoX benefits from visual instruction tuning,\nempowering it to take geometric images and questions as input and generate\nverifiable solutions. Experiments show that GeoX outperforms both generalists\nand geometric specialists on publicly recognized benchmarks, such as GeoQA,\nUniGeo, Geometry3K, and PGPS9k.\n","authors":["Renqiu Xia","Mingsheng Li","Hancheng Ye","Wenjie Wu","Hongbin Zhou","Jiakang Yuan","Tianshuo Peng","Xinyu Cai","Xiangchao Yan","Bin Wang","Conghui He","Botian Shi","Tao Chen","Junchi Yan","Bo Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.11863v2.pdf","comment":"Our code is available at https://github.com/Alpha-Innovator/GeoX"},{"id":"http://arxiv.org/abs/2412.07277v2","updated":"2025-01-10T12:17:00Z","published":"2024-12-10T08:07:19Z","title":"Backdoor Attacks against No-Reference Image Quality Assessment Models\n via a Scalable Trigger","summary":" No-Reference Image Quality Assessment (NR-IQA), responsible for assessing the\nquality of a single input image without using any reference, plays a critical\nrole in evaluating and optimizing computer vision systems, e.g., low-light\nenhancement. Recent research indicates that NR-IQA models are susceptible to\nadversarial attacks, which can significantly alter predicted scores with\nvisually imperceptible perturbations. Despite revealing vulnerabilities, these\nattack methods have limitations, including high computational demands,\nuntargeted manipulation, limited practical utility in white-box scenarios, and\nreduced effectiveness in black-box scenarios. 
To address these challenges, we\nshift our focus to another significant threat and present a novel\npoisoning-based backdoor attack against NR-IQA (BAIQA), allowing the attacker\nto manipulate the IQA model's output to any desired target value by simply\nadjusting a scaling coefficient $\\alpha$ for the trigger. We propose to inject\nthe trigger in the discrete cosine transform (DCT) domain to improve the local\ninvariance of the trigger for countering trigger diminishment in NR-IQA models\ndue to widely adopted data augmentations. Furthermore, the universal\nadversarial perturbations (UAP) in the DCT space are designed as the trigger,\nto increase IQA model susceptibility to manipulation and improve attack\neffectiveness. In addition to the heuristic method for poison-label BAIQA\n(P-BAIQA), we explore the design of clean-label BAIQA (C-BAIQA), focusing on\n$\\alpha$ sampling and image data refinement, driven by theoretical insights we\nreveal. Extensive experiments on diverse datasets and various NR-IQA models\ndemonstrate the effectiveness of our attacks. Code can be found at\nhttps://github.com/yuyi-sd/BAIQA.\n","authors":["Yi Yu","Song Xia","Xun Lin","Wenhan Yang","Shijian Lu","Yap-peng Tan","Alex Kot"],"pdf_url":"https://arxiv.org/pdf/2412.07277v2.pdf","comment":"Accept by AAAI 2025"},{"id":"http://arxiv.org/abs/2406.06521v2","updated":"2025-01-10T12:05:16Z","published":"2024-06-10T17:59:01Z","title":"PGSR: Planar-based Gaussian Splatting for Efficient and High-Fidelity\n Surface Reconstruction","summary":" Recently, 3D Gaussian Splatting (3DGS) has attracted widespread attention due\nto its high-quality rendering, and ultra-fast training and rendering speed.\nHowever, due to the unstructured and irregular nature of Gaussian point clouds,\nit is difficult to guarantee geometric reconstruction accuracy and multi-view\nconsistency simply by relying on image reconstruction loss. 
Although many\nstudies on surface reconstruction based on 3DGS have emerged recently, the\nquality of their meshes is generally unsatisfactory. To address this problem,\nwe propose a fast planar-based Gaussian splatting reconstruction representation\n(PGSR) to achieve high-fidelity surface reconstruction while ensuring\nhigh-quality rendering. Specifically, we first introduce an unbiased depth\nrendering method, which directly renders the distance from the camera origin to\nthe Gaussian plane and the corresponding normal map based on the Gaussian\ndistribution of the point cloud, and divides the two to obtain the unbiased\ndepth. We then introduce single-view geometric, multi-view photometric, and\ngeometric regularization to preserve global geometric accuracy. We also propose\na camera exposure compensation model to cope with scenes with large\nillumination variations. Experiments on indoor and outdoor scenes show that our\nmethod achieves fast training and rendering while maintaining high-fidelity\nrendering and geometric reconstruction, outperforming 3DGS-based and NeRF-based\nmethods.\n","authors":["Danpeng Chen","Hai Li","Weicai Ye","Yifan Wang","Weijian Xie","Shangjin Zhai","Nan Wang","Haomin Liu","Hujun Bao","Guofeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.06521v2.pdf","comment":"project page: https://zju3dv.github.io/pgsr/"},{"id":"http://arxiv.org/abs/2501.00574v2","updated":"2025-01-10T12:00:51Z","published":"2024-12-31T18:01:23Z","title":"VideoChat-Flash: Hierarchical Compression for Long-Context Video\n Modeling","summary":" Long-context modeling is a critical capability for multimodal large language\nmodels (MLLMs), enabling them to process long-form contents with implicit\nmemorization. Despite its advances, handling extremely long videos remains\nchallenging due to the difficulty in maintaining crucial features over extended\nsequences. 
This paper introduces a Hierarchical visual token Compression (HiCo)\nmethod designed for high-fidelity representation and a practical context\nmodeling system VideoChat-Flash tailored for multimodal long-sequence\nprocessing. HiCo capitalizes on the redundancy of visual information in long\nvideos to compress long video context from the clip-level to the video-level,\nreducing the compute significantly while preserving essential details.\nVideoChat-Flash features a multi-stage short-to-long learning scheme, a rich\ndataset of real-world long videos named LongVid, and an upgraded\n\"Needle-In-A-video-Haystack\" (NIAH) for evaluating context capacities. In\nextensive experiments, VideoChat-Flash shows the leading performance on both\nmainstream long and short video benchmarks at the 2B and 7B model scale. It is\nthe first among open-source models to reach 99.1% accuracy over 10,000 frames\nin NIAH.\n","authors":["Xinhao Li","Yi Wang","Jiashuo Yu","Xiangyu Zeng","Yuhan Zhu","Haian Huang","Jianfei Gao","Kunchang Li","Yinan He","Chenting Wang","Yu Qiao","Yali Wang","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2501.00574v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05904v1","updated":"2025-01-10T12:00:11Z","published":"2025-01-10T12:00:11Z","title":"Binary Event-Driven Spiking Transformer","summary":" Transformer-based Spiking Neural Networks (SNNs) introduce a novel\nevent-driven self-attention paradigm that combines the high performance of\nTransformers with the energy efficiency of SNNs. However, the larger model size\nand increased computational demands of the Transformer structure limit their\npracticality in resource-constrained scenarios. In this paper, we integrate\nbinarization techniques into Transformer-based SNNs and propose the Binary\nEvent-Driven Spiking Transformer, i.e. BESTformer. The proposed BESTformer can\nsignificantly reduce storage and computational demands by representing weights\nand attention maps with a mere 1-bit. 
However, BESTformer suffers from a severe\nperformance drop from its full-precision counterpart due to the limited\nrepresentation capability of binarization. To address this issue, we propose a\nCoupled Information Enhancement (CIE) method, which consists of a reversible\nframework and information enhancement distillation. By maximizing the mutual\ninformation between the binary model and its full-precision counterpart, the\nCIE method effectively mitigates the performance degradation of the BESTformer.\nExtensive experiments on static and neuromorphic datasets demonstrate that our\nmethod achieves superior performance to other binary SNNs, showcasing its\npotential as a compact yet high-performance model for resource-limited edge\ndevices.\n","authors":["Honglin Cao","Zijian Zhou","Wenjie Wei","Ammar Belatreche","Yu Liang","Dehao Zhang","Malu Zhang","Yang Yang","Haizhou Li"],"pdf_url":"https://arxiv.org/pdf/2501.05904v1.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2501.05901v1","updated":"2025-01-10T11:53:46Z","published":"2025-01-10T11:53:46Z","title":"Valley2: Exploring Multimodal Models with Scalable Vision-Language\n Design","summary":" Recently, vision-language models have made remarkable progress, demonstrating\noutstanding capabilities in various tasks such as image captioning and video\nunderstanding. We introduce Valley2, a novel multimodal large language model\ndesigned to enhance performance across all domains and extend the boundaries of\npractical applications in e-commerce and short video scenarios. Notably,\nValley2 achieves state-of-the-art (SOTA) performance on e-commerce benchmarks,\nsurpassing open-source models of similar size by a large margin (79.66 vs.\n72.76). Additionally, Valley2 ranks second on the OpenCompass leaderboard among\nmodels with fewer than 10B parameters, with an impressive average score of\n67.4. 
The code and model weights are open-sourced at\nhttps://github.com/bytedance/Valley.\n","authors":["Ziheng Wu","Zhenghao Chen","Ruipu Luo","Can Zhang","Yuan Gao","Zhentao He","Xian Wang","Haoran Lin","Minghui Qiu"],"pdf_url":"https://arxiv.org/pdf/2501.05901v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05435v6","updated":"2025-01-10T11:50:21Z","published":"2024-03-08T16:38:11Z","title":"OmniCount: Multi-label Object Counting with Semantic-Geometric Priors","summary":" Object counting is pivotal for understanding the composition of scenes.\nPreviously, this task was dominated by class-specific methods, which have\ngradually evolved into more adaptable class-agnostic strategies. However, these\nstrategies come with their own set of limitations, such as the need for manual\nexemplar input and multiple passes for multiple categories, resulting in\nsignificant inefficiencies. This paper introduces a more practical approach\nenabling simultaneous counting of multiple object categories using an\nopen-vocabulary framework. Our solution, OmniCount, stands out by using\nsemantic and geometric insights (priors) from pre-trained models to count\nmultiple categories of objects as specified by users, all without additional\ntraining. OmniCount distinguishes itself by generating precise object masks and\nleveraging varied interactive prompts via the Segment Anything Model for\nefficient counting. To evaluate OmniCount, we created the OmniCount-191\nbenchmark, a first-of-its-kind dataset with multi-label object counts,\nincluding points, bounding boxes, and VQA annotations. Our comprehensive\nevaluation in OmniCount-191, alongside other leading benchmarks, demonstrates\nOmniCount's exceptional performance, significantly outpacing existing\nsolutions. 
The project webpage is available at\nhttps://mondalanindya.github.io/OmniCount.\n","authors":["Anindya Mondal","Sauradip Nag","Xiatian Zhu","Anjan Dutta"],"pdf_url":"https://arxiv.org/pdf/2403.05435v6.pdf","comment":"Accepted to AAAI 2025"},{"id":"http://arxiv.org/abs/2501.05892v1","updated":"2025-01-10T11:44:59Z","published":"2025-01-10T11:44:59Z","title":"Beyond Flat Text: Dual Self-inherited Guidance for Visual Text\n Generation","summary":" In real-world images, slanted or curved texts, especially those on cans,\nbanners, or badges, appear as frequently, if not more so, than flat texts due\nto artistic design or layout constraints. While high-quality visual text\ngeneration has become available with the advanced generative capabilities of\ndiffusion models, these models often produce distorted text and inharmonious\ntext background when given slanted or curved text layouts due to training data\nlimitation. In this paper, we introduce a new training-free framework, STGen,\nwhich accurately generates visual texts in challenging scenarios (\\eg, slanted\nor curved text layouts) while harmonizing them with the text background. Our\nframework decomposes the visual text generation process into two branches: (i)\n\\textbf{Semantic Rectification Branch}, which leverages the ability in\ngenerating flat but accurate visual texts of the model to guide the generation\nof challenging scenarios. The generated latent of flat text is abundant in\naccurate semantic information related both to the text itself and its\nbackground. By incorporating this, we rectify the semantic information of the\ntexts and harmonize the integration of the text with its background in complex\nlayouts. (ii) \\textbf{Structure Injection Branch}, which reinforces the visual\ntext structure during inference. We incorporate the latent information of the\nglyph image, rich in glyph structure, as a new condition to further strengthen\nthe text structure. 
To enhance image harmony, we also apply an effective\ncombination method to merge the priors, providing a solid foundation for\ngeneration. Extensive experiments across a variety of visual text layouts\ndemonstrate that our framework achieves superior accuracy and outstanding\nquality.\n","authors":["Minxing Luo","Zixun Xia","Liaojun Chen","Zhenhang Li","Weichao Zeng","Jianye Wang","Wentao Cheng","Yaxing Wang","Yu Zhou","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2501.05892v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05885v1","updated":"2025-01-10T11:37:50Z","published":"2025-01-10T11:37:50Z","title":"EDNet: Edge-Optimized Small Target Detection in UAV Imagery -- Faster\n Context Attention, Better Feature Fusion, and Hardware Acceleration","summary":" Detecting small targets in drone imagery is challenging due to low\nresolution, complex backgrounds, and dynamic scenes. We propose EDNet, a novel\nedge-target detection framework built on an enhanced YOLOv10 architecture,\noptimized for real-time applications without post-processing. EDNet\nincorporates an XSmall detection head and a Cross Concat strategy to improve\nfeature fusion and multi-scale context awareness for detecting tiny targets in\ndiverse environments. Our unique C2f-FCA block employs Faster Context Attention\nto enhance feature extraction while reducing computational complexity. The WIoU\nloss function is employed for improved bounding box regression. With seven\nmodel sizes ranging from Tiny to XL, EDNet accommodates various deployment\nenvironments, enabling local real-time inference and ensuring data privacy.\nNotably, EDNet achieves up to a 5.6% gain in mAP@50 with significantly fewer\nparameters. On an iPhone 12, EDNet variants operate at speeds ranging from 16\nto 55 FPS, providing a scalable and efficient solution for edge-based object\ndetection in challenging drone imagery. 
The source code and pre-trained models\nare available at: https://github.com/zsniko/EDNet.\n","authors":["Zhifan Song","Yuan Zhang","Abd Al Rahman M. Abu Ebayyeh"],"pdf_url":"https://arxiv.org/pdf/2501.05885v1.pdf","comment":"Accepted in 21st IEEE International Conference on Ubiquitous\n Intelligence and Computing (UIC 2024)\n https://www.ieee-smart-world.org/2024/uic"},{"id":"http://arxiv.org/abs/2501.01987v2","updated":"2025-01-10T11:36:09Z","published":"2024-12-30T18:08:13Z","title":"Gender Bias in Text-to-Video Generation Models: A case study of Sora","summary":" The advent of text-to-video generation models has revolutionized content\ncreation as it produces high-quality videos from textual prompts. However,\nconcerns regarding inherent biases in such models have prompted scrutiny,\nparticularly regarding gender representation. Our study investigates the\npresence of gender bias in OpenAI's Sora, a state-of-the-art text-to-video\ngeneration model. We uncover significant evidence of bias by analyzing the\ngenerated videos from a diverse set of gender-neutral and stereotypical\nprompts. The results indicate that Sora disproportionately associates specific\ngenders with stereotypical behaviors and professions, which reflects societal\nprejudices embedded in its training data.\n","authors":["Mohammad Nadeem","Shahab Saquib Sohail","Erik Cambria","Björn W. Schuller","Amir Hussain"],"pdf_url":"https://arxiv.org/pdf/2501.01987v2.pdf","comment":"7 pages, 3 figures"},{"id":"http://arxiv.org/abs/2501.05884v1","updated":"2025-01-10T11:35:43Z","published":"2025-01-10T11:35:43Z","title":"Text-to-Edit: Controllable End-to-End Video Ad Creation via Multimodal\n LLMs","summary":" The exponential growth of short-video content has ignited a surge in the\nnecessity for efficient, automated solutions to video editing, with challenges\narising from the need to understand videos and tailor the editing according to\nuser requirements. 
Addressing this need, we propose an innovative end-to-end\nfoundational framework, ultimately actualizing precise control over the final\nvideo content editing. Leveraging the flexibility and generalizability of\nMultimodal Large Language Models (MLLMs), we defined clear input-output\nmappings for efficient video creation. To bolster the model's capability in\nprocessing and comprehending video content, we introduce a strategic\ncombination of a denser frame rate and a slow-fast processing technique,\nsignificantly enhancing the extraction and understanding of both temporal and\nspatial video information. Furthermore, we introduce a text-to-edit mechanism\nthat allows users to achieve desired video outcomes through textual input,\nthereby enhancing the quality and controllability of the edited videos. Through\ncomprehensive experimentation, our method has not only showcased significant\neffectiveness within advertising datasets, but also yields universally\napplicable conclusions on public datasets.\n","authors":["Dabing Cheng","Haosen Zhan","Xingchen Zhao","Guisheng Liu","Zemin Li","Jinghui Xie","Zhao Song","Weiguo Feng","Bingyue Peng"],"pdf_url":"https://arxiv.org/pdf/2501.05884v1.pdf","comment":"16pages conference"},{"id":"http://arxiv.org/abs/2501.05880v1","updated":"2025-01-10T11:32:56Z","published":"2025-01-10T11:32:56Z","title":"TakuNet: an Energy-Efficient CNN for Real-Time Inference on Embedded UAV\n systems in Emergency Response Scenarios","summary":" Designing efficient neural networks for embedded devices is a critical\nchallenge, particularly in applications requiring real-time performance, such\nas aerial imaging with drones and UAVs for emergency responses. In this work,\nwe introduce TakuNet, a novel light-weight architecture which employs\ntechniques such as depth-wise convolutions and an early downsampling stem to\nreduce computational complexity while maintaining high accuracy. 
It leverages\ndense connections for fast convergence during training and uses 16-bit\nfloating-point precision for optimization on embedded hardware accelerators.\nExperimental evaluation on two public datasets shows that TakuNet achieves\nnear-state-of-the-art accuracy in classifying aerial images of emergency\nsituations, despite its minimal parameter count. Real-world tests on embedded\ndevices, namely Jetson Orin Nano and Raspberry Pi, confirm TakuNet's\nefficiency, achieving more than 650 fps on the 15W Jetson board, making it\nsuitable for real-time AI processing on resource-constrained platforms and\nadvancing the applicability of drones in emergency scenarios. The code and\nimplementation details are publicly released.\n","authors":["Daniel Rossi","Guido Borghi","Roberto Vezzani"],"pdf_url":"https://arxiv.org/pdf/2501.05880v1.pdf","comment":"This paper has been accepted at WACVW 2025, which will take place on\n 28/02/2025. The official conference proceedings have not yet been published\n at the time of submission to arXiv. The final version of the paper,\n incorporating any changes based on feedback received during the conference,\n will be included in the proceedings once they are made available"},{"id":"http://arxiv.org/abs/2501.05874v1","updated":"2025-01-10T11:17:15Z","published":"2025-01-10T11:17:15Z","title":"VideoRAG: Retrieval-Augmented Generation over Video Corpus","summary":" Retrieval-Augmented Generation (RAG) is a powerful strategy to address the\nissue of generating factually incorrect outputs in foundation models by\nretrieving external knowledge relevant to queries and incorporating it into\ntheir generation process. However, existing RAG approaches have primarily\nfocused on textual information, with some recent advancements beginning to\nconsider images, and they largely overlook videos, a rich source of multimodal\nknowledge capable of representing events, processes, and contextual details\nmore effectively than any other modality. 
While a few recent studies explore\nthe integration of videos in the response generation process, they either\npredefine query-associated videos without retrieving them according to queries,\nor convert videos into the textual descriptions without harnessing their\nmultimodal richness. To tackle these, we introduce VideoRAG, a novel framework\nthat not only dynamically retrieves relevant videos based on their relevance\nwith queries but also utilizes both visual and textual information of videos in\nthe output generation. Further, to operationalize this, our method revolves\naround the recent advance of Large Video Language Models (LVLMs), which enable\nthe direct processing of video content to represent it for retrieval and\nseamless integration of the retrieved videos jointly with queries. We\nexperimentally validate the effectiveness of VideoRAG, showcasing that it is\nsuperior to relevant baselines.\n","authors":["Soyeong Jeong","Kangsan Kim","Jinheon Baek","Sung Ju Hwang"],"pdf_url":"https://arxiv.org/pdf/2501.05874v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05862v1","updated":"2025-01-10T10:59:27Z","published":"2025-01-10T10:59:27Z","title":"Language-Inspired Relation Transfer for Few-shot Class-Incremental\n Learning","summary":" Depicting novel classes with language descriptions by observing few-shot\nsamples is inherent in human-learning systems. This lifelong learning\ncapability helps to distinguish new knowledge from old ones through the\nincrease of open-world learning, namely Few-Shot Class-Incremental Learning\n(FSCIL). Existing works to solve this problem mainly rely on the careful tuning\nof visual encoders, which shows an evident trade-off between the base knowledge\nand incremental ones. Motivated by human learning systems, we propose a new\nLanguage-inspired Relation Transfer (LRT) paradigm to understand objects by\njoint visual clues and text depictions, composed of two major steps. 
We first\ntransfer the pretrained text knowledge to the visual domains by proposing a\ngraph relation transformation module and then fuse the visual and language\nembedding by a text-vision prototypical fusion module. Second, to mitigate the\ndomain gap caused by visual finetuning, we propose context prompt learning for\nfast domain alignment and imagined contrastive learning to alleviate the\ninsufficient text data during alignment. With collaborative learning of domain\nalignments and text-image transfer, our proposed LRT outperforms the\nstate-of-the-art models by over $13\\%$ and $7\\%$ on the final session of\nmini-ImageNet and CIFAR-100 FSCIL benchmarks.\n","authors":["Yifan Zhao","Jia Li","Zeyin Song","Yonghong Tian"],"pdf_url":"https://arxiv.org/pdf/2501.05862v1.pdf","comment":"Accepted by IEEE TPAMI"},{"id":"http://arxiv.org/abs/2501.05852v1","updated":"2025-01-10T10:47:00Z","published":"2025-01-10T10:47:00Z","title":"MRI Patterns of the Hippocampus and Amygdala for Predicting Stages of\n Alzheimer's Progression: A Minimal Feature Machine Learning Framework","summary":" Alzheimer's disease (AD) progresses through distinct stages, from early mild\ncognitive impairment (EMCI) to late mild cognitive impairment (LMCI) and\neventually to AD. Accurate identification of these stages, especially\ndistinguishing LMCI from EMCI, is crucial for developing pre-dementia\ntreatments but remains challenging due to subtle and overlapping imaging\nfeatures. This study proposes a minimal-feature machine learning framework that\nleverages structural MRI data, focusing on the hippocampus and amygdala as\nregions of interest. The framework addresses the curse of dimensionality\nthrough feature selection, utilizes region-specific voxel information, and\nimplements innovative data organization to enhance classification performance\nby reducing noise. 
The methodology integrates dimensionality reduction\ntechniques such as PCA and t-SNE with state-of-the-art classifiers, achieving\nthe highest accuracy of 88.46%. This framework demonstrates the potential for\nefficient and accurate staging of AD progression while providing valuable\ninsights for clinical applications.\n","authors":["Aswini Kumar Patra","Soraisham Elizabeth Devi","Tejashwini Gajurel"],"pdf_url":"https://arxiv.org/pdf/2501.05852v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03630v2","updated":"2025-01-10T10:45:49Z","published":"2025-01-07T09:00:07Z","title":"MC-VTON: Minimal Control Virtual Try-On Diffusion Transformer","summary":" Virtual try-on methods based on diffusion models achieve realistic try-on\neffects. They use an extra reference network or an additional image encoder to\nprocess multiple conditional image inputs, which adds pre-processing complexity\nand additional computational costs. Besides, they require more than 25\ninference steps, bringing longer inference time. In this work, with the\ndevelopment of diffusion transformer (DiT), we rethink the necessity of\nadditional reference network or image encoder and introduce MC-VTON, which\nleverages DiT's intrinsic backbone to seamlessly integrate minimal conditional\ntry-on inputs. Compared to existing methods, the superiority of MC-VTON is\ndemonstrated in four aspects: (1) Superior detail fidelity. Our DiT-based\nMC-VTON exhibits superior fidelity in preserving fine-grained details. (2)\nSimplified network and inputs. We remove any extra reference network or image\nencoder. We also remove unnecessary conditions like the long prompt, pose\nestimation, human parsing, and depth map. We require only the masked person\nimage and the garment image. (3) Parameter-efficient training. To process the\ntry-on task, we fine-tune the FLUX.1-dev with only 39.7M additional parameters\n(0.33% of the backbone parameters). (4) Fewer inference steps. 
We apply\ndistillation diffusion on MC-VTON and only need 8 steps to generate a realistic\ntry-on image, with only 86.8M additional parameters (0.72% of the backbone\nparameters). Experiments show that MC-VTON achieves superior qualitative and\nquantitative results with fewer condition inputs, trainable parameters, and\ninference steps than baseline methods.\n","authors":["Junsheng Luan","Guangyuan Li","Lei Zhao","Wei Xing"],"pdf_url":"https://arxiv.org/pdf/2501.03630v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05851v1","updated":"2025-01-10T10:45:38Z","published":"2025-01-10T10:45:38Z","title":"Identity-aware Feature Decoupling Learning for Clothing-change Person\n Re-identification","summary":" Clothing-change person re-identification (CC Re-ID) has attracted increasing\nattention in recent years due to its application prospect. Most existing works\nstruggle to adequately extract the ID-related information from the original RGB\nimages. In this paper, we propose an Identity-aware Feature Decoupling (IFD)\nlearning framework to mine identity-related features. Particularly, IFD\nexploits a dual stream architecture that consists of a main stream and an\nattention stream. The attention stream takes the clothing-masked images as\ninputs and derives the identity attention weights for effectively transferring\nthe spatial knowledge to the main stream and highlighting the regions with\nabundant identity-related information. To eliminate the semantic gap between\nthe inputs of two streams, we propose a clothing bias diminishing module\nspecific to the main stream to regularize the features of clothing-relevant\nregions. 
Extensive experimental results demonstrate that our framework\noutperforms other baseline models on several widely-used CC Re-ID datasets.\n","authors":["Haoxuan Xu","Bo Li","Guanglin Niu"],"pdf_url":"https://arxiv.org/pdf/2501.05851v1.pdf","comment":"Accepted by ICASSP2025"},{"id":"http://arxiv.org/abs/2501.03968v2","updated":"2025-01-10T10:38:49Z","published":"2025-01-07T18:06:27Z","title":"VLM-driven Behavior Tree for Context-aware Task Planning","summary":" The use of Large Language Models (LLMs) for generating Behavior Trees (BTs)\nhas recently gained attention in the robotics community, yet remains in its\nearly stages of development. In this paper, we propose a novel framework that\nleverages Vision-Language Models (VLMs) to interactively generate and edit BTs\nthat address visual conditions, enabling context-aware robot operations in\nvisually complex environments. A key feature of our approach lies in the\nconditional control through self-prompted visual conditions. Specifically, the\nVLM generates BTs with visual condition nodes, where conditions are expressed\nas free-form text. Another VLM process integrates the text into its prompt and\nevaluates the conditions against real-world images during robot execution. We\nvalidated our framework in a real-world cafe scenario, demonstrating both its\nfeasibility and limitations.\n","authors":["Naoki Wake","Atsushi Kanehira","Jun Takamatsu","Kazuhiro Sasabuchi","Katsushi Ikeuchi"],"pdf_url":"https://arxiv.org/pdf/2501.03968v2.pdf","comment":"10 pages, 11 figures, 5 tables. Last updated on January 9th, 2024"},{"id":"http://arxiv.org/abs/2406.10221v2","updated":"2025-01-10T10:36:58Z","published":"2024-06-14T17:54:54Z","title":"Long Story Short: Story-level Video Understanding from 20K Short Films","summary":" Recent developments in vision-language models have significantly advanced\nvideo understanding. Existing datasets and tasks, however, have notable\nlimitations. 
Most datasets are confined to short videos with limited events and\nnarrow narratives. For example, datasets with instructional and egocentric\nvideos often depict activities of one person in a single scene. Although\nexisting movie datasets offer richer content, they are often limited to\nshort-term tasks, lack publicly available videos, and frequently encounter data\nleakage issues given the use of subtitles and other information about\ncommercial movies during LLM pretraining. To address the above limitations, we\npropose Short-Films 20K (SF20K), the largest publicly available movie dataset.\nSF20K is composed of 20,143 amateur films and offers long-term video tasks in\nthe form of multiple-choice and open-ended question answering. Our extensive\nanalysis of SF20K reveals minimal data leakage, emphasizes the need for\nlong-term reasoning, and demonstrates the strong performance of recent VLMs.\nFinally, we show that instruction tuning on the SF20K-Train set substantially\nimproves model performance, paving the way for future progress in long-term\nvideo understanding.\n","authors":["Ridouane Ghermi","Xi Wang","Vicky Kalogeiton","Ivan Laptev"],"pdf_url":"https://arxiv.org/pdf/2406.10221v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05839v1","updated":"2025-01-10T10:26:54Z","published":"2025-01-10T10:26:54Z","title":"Poetry in Pixels: Prompt Tuning for Poem Image Generation via Diffusion\n Models","summary":" The task of text-to-image generation has encountered significant challenges\nwhen applied to literary works, especially poetry. Poems are a distinct form of\nliterature, with meanings that frequently transcend beyond the literal words.\nTo address this shortcoming, we propose a PoemToPixel framework designed to\ngenerate images that visually represent the inherent meanings of poems. Our\napproach incorporates the concept of prompt tuning in our image generation\nframework to ensure that the resulting images closely align with the poetic\ncontent. 
In addition, we propose the PoeKey algorithm, which extracts three key\nelements in the form of emotions, visual elements, and themes from poems to\nform instructions which are subsequently provided to a diffusion model for\ngenerating corresponding images. Furthermore, to expand the diversity of the\npoetry dataset across different genres and ages, we introduce MiniPo, a novel\nmultimodal dataset comprising 1001 children's poems and images. Leveraging this\ndataset alongside PoemSum, we conducted both quantitative and qualitative\nevaluations of image generation using our PoemToPixel framework. This paper\ndemonstrates the effectiveness of our approach and offers a fresh perspective\non generating images from literary sources.\n","authors":["Sofia Jamil","Bollampalli Areen Reddy","Raghvendra Kumar","Sriparna Saha","K J Joseph","Koustava Goswami"],"pdf_url":"https://arxiv.org/pdf/2501.05839v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11875v2","updated":"2025-01-10T10:15:49Z","published":"2023-10-18T10:49:29Z","title":"Fractional Concepts in Neural Networks: Enhancing Activation Functions","summary":" Designing effective neural networks requires tuning architectural elements.\nThis study integrates fractional calculus into neural networks by introducing\nfractional order derivatives (FDO) as tunable parameters in activation\nfunctions, allowing diverse activation functions by adjusting the FDO. We\nevaluate these fractional activation functions on various datasets and network\narchitectures, comparing their performance with traditional and new activation\nfunctions. Our experiments assess their impact on accuracy, time complexity,\ncomputational overhead, and memory usage. Results suggest fractional activation\nfunctions, particularly fractional Sigmoid, offer benefits in some scenarios.\nChallenges related to consistency and efficiency remain. 
Practical implications\nand limitations are discussed.\n","authors":["Zahra Alijani","Vojtech Molek"],"pdf_url":"https://arxiv.org/pdf/2310.11875v2.pdf","comment":"8 pages, 8 figures, submitted to pattern recognition letters"},{"id":"http://arxiv.org/abs/2501.01834v2","updated":"2025-01-10T10:08:50Z","published":"2025-01-03T14:38:01Z","title":"MoColl: Agent-Based Specific and General Model Collaboration for Image\n Captioning","summary":" Image captioning is a critical task at the intersection of computer vision\nand natural language processing, with wide-ranging applications across various\ndomains. For complex tasks such as diagnostic report generation, deep learning\nmodels require not only domain-specific image-caption datasets but also the\nincorporation of relevant general knowledge to provide contextual accuracy.\nExisting approaches exhibit inherent limitations: specialized models excel in\ncapturing domain-specific details but lack generalization, while\nvision-language models (VLMs) built on large language models (LLMs) leverage\ngeneral knowledge but struggle with domain-specific adaptation. To address\nthese limitations, this paper proposes a novel agent-enhanced model\ncollaboration framework, which we call MoColl, designed to effectively\nintegrate domain-specific and general knowledge. Specifically, our approach is\nto decompose complex image captioning tasks into a series of interconnected\nquestion-answer subtasks. A trainable visual question answering (VQA) model is\nemployed as a specialized tool to focus on domain-specific visual analysis,\nanswering task-specific questions based on image content. Concurrently, an\nLLM-based agent with general knowledge formulates these questions and\nsynthesizes the resulting question-answer pairs into coherent captions. Beyond\nits role in leveraging the VQA model, the agent further guides its training to\nenhance its domain-specific capabilities. 
Experimental results on radiology\nreport generation validate the effectiveness of the proposed framework,\ndemonstrating significant improvements in the quality of generated reports.\n","authors":["Pu Yang","Bin Dong"],"pdf_url":"https://arxiv.org/pdf/2501.01834v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.09278v2","updated":"2025-01-10T10:07:55Z","published":"2024-12-12T13:41:35Z","title":"Towards a Multimodal Large Language Model with Pixel-Level Insight for\n Biomedicine","summary":" In recent years, Multimodal Large Language Models (MLLM) have achieved\nnotable advancements, demonstrating the feasibility of developing an\nintelligent biomedical assistant. However, current biomedical MLLMs\npredominantly focus on image-level understanding and restrict interactions to\ntextual commands, thus limiting their capability boundaries and the flexibility\nof usage. In this paper, we introduce a novel end-to-end multimodal large\nlanguage model for the biomedical domain, named MedPLIB, which possesses\npixel-level understanding. Excitingly, it supports visual question answering\n(VQA), arbitrary pixel-level prompts (points, bounding boxes, and free-form\nshapes), and pixel-level grounding. We propose a novel Mixture-of-Experts (MoE)\nmulti-stage training strategy, which divides MoE into separate training phases\nfor a visual-language expert model and a pixel-grounding expert model, followed\nby fine-tuning using MoE. This strategy effectively coordinates multitask\nlearning while maintaining the computational cost at inference equivalent to\nthat of a single expert model. To advance the research of biomedical MLLMs, we\nintroduce the Medical Complex Vision Question Answering Dataset (MeCoVQA),\nwhich comprises an array of 8 modalities for complex medical imaging question\nanswering and image region understanding. Experimental results indicate that\nMedPLIB has achieved state-of-the-art outcomes across multiple medical visual\nlanguage tasks. 
More importantly, in zero-shot evaluations for the pixel\ngrounding task, MedPLIB leads the best small and large models by margins of\n19.7 and 15.6 respectively on the mDice metric. The codes, data, and model\ncheckpoints will be made publicly available at\nhttps://github.com/ShawnHuang497/MedPLIB.\n","authors":["Xiaoshuang Huang","Lingdong Shen","Jia Liu","Fangxin Shang","Hongxiang Li","Haifeng Huang","Yehui Yang"],"pdf_url":"https://arxiv.org/pdf/2412.09278v2.pdf","comment":"Accepted by AAAI2025"},{"id":"http://arxiv.org/abs/2501.05828v1","updated":"2025-01-10T10:07:41Z","published":"2025-01-10T10:07:41Z","title":"UltraRay: Full-Path Ray Tracing for Enhancing Realism in Ultrasound\n Simulation","summary":" Traditional ultrasound simulators solve the wave equation to model pressure\ndistribution fields, achieving high accuracy but requiring significant\ncomputational time and resources. To address this, ray tracing approaches have\nbeen introduced, modeling wave propagation as rays interacting with boundaries\nand scatterers. However, existing models simplify ray propagation, generating\nechoes at interaction points without considering return paths to the sensor.\nThis can result in unrealistic artifacts and necessitates careful scene tuning\nfor plausible results. We propose a novel ultrasound simulation pipeline that\nutilizes a ray tracing algorithm to generate echo data, tracing each ray from\nthe transducer through the scene and back to the sensor. To replicate advanced\nultrasound imaging, we introduce a ray emission scheme optimized for plane wave\nimaging, incorporating delay and steering capabilities. Furthermore, we\nintegrate a standard signal processing pipeline to simulate end-to-end\nultrasound image formation. We showcase the efficacy of the proposed pipeline\nby modeling synthetic scenes featuring highly reflective objects, such as\nbones. 
In doing so, our proposed approach, UltraRay, not only enhances the\noverall visual quality but also improves the realism of the simulated images by\naccurately capturing secondary reflections and reducing unnatural artifacts. By\nbuilding on top of a differentiable framework, the proposed pipeline lays the\ngroundwork for a fast and differentiable ultrasound simulation tool necessary\nfor gradient-based optimization, enabling advanced ultrasound beamforming\nstrategies, neural network integration, and accurate inverse scene\nreconstruction.\n","authors":["Felix Duelmer","Mohammad Farid Azampour","Nassir Navab"],"pdf_url":"https://arxiv.org/pdf/2501.05828v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05826v1","updated":"2025-01-10T10:03:56Z","published":"2025-01-10T10:03:56Z","title":"AI-Driven Diabetic Retinopathy Screening: Multicentric Validation of\n AIDRSS in India","summary":" Purpose: Diabetic retinopathy (DR) is a major cause of vision loss,\nparticularly in India, where access to retina specialists is limited in rural\nareas. This study aims to evaluate the Artificial Intelligence-based Diabetic\nRetinopathy Screening System (AIDRSS) for DR detection and prevalence\nassessment, addressing the growing need for scalable, automated screening\nsolutions in resource-limited settings.\n Approach: A multicentric, cross-sectional study was conducted in Kolkata,\nIndia, involving 5,029 participants and 10,058 macula-centric retinal fundus\nimages. The AIDRSS employed a deep learning algorithm with 50 million trainable\nparameters, integrated with Contrast Limited Adaptive Histogram Equalization\n(CLAHE) preprocessing for enhanced image quality. DR was graded using the\nInternational Clinical Diabetic Retinopathy (ICDR) Scale, categorizing disease\ninto five stages (DR0 to DR4). 
Statistical metrics including sensitivity,\nspecificity, and prevalence rates were evaluated against expert retina\nspecialist assessments.\n Results: The prevalence of DR in the general population was 13.7%, rising to\n38.2% among individuals with elevated random blood glucose levels. The AIDRSS\nachieved an overall sensitivity of 92%, specificity of 88%, and 100%\nsensitivity for detecting referable DR (DR3 and DR4). These results demonstrate\nthe system's robust performance in accurately identifying and grading DR in a\ndiverse population.\n Conclusions: AIDRSS provides a reliable, scalable solution for early DR\ndetection in resource-constrained environments. Its integration of advanced AI\ntechniques ensures high diagnostic accuracy, with potential to significantly\nreduce the burden of diabetes-related vision loss in underserved regions.\n","authors":["Amit Kr Dey","Pradeep Walia","Girish Somvanshi","Abrar Ali","Sagarnil Das","Pallabi Paul","Minakhi Ghosh"],"pdf_url":"https://arxiv.org/pdf/2501.05826v1.pdf","comment":"22 pages, 5 figures. arXiv admin note: substantial text overlap with\n arXiv:1812.07105 by other authors without attribution"},{"id":"http://arxiv.org/abs/2501.05823v1","updated":"2025-01-10T10:01:36Z","published":"2025-01-10T10:01:36Z","title":"PersonaHOI: Effortlessly Improving Personalized Face with Human-Object\n Interaction Generation","summary":" We introduce PersonaHOI, a training- and tuning-free framework that fuses a\ngeneral StableDiffusion model with a personalized face diffusion (PFD) model to\ngenerate identity-consistent human-object interaction (HOI) images. While\nexisting PFD models have advanced significantly, they often overemphasize\nfacial features at the expense of full-body coherence. To address this,\nPersonaHOI introduces an additional StableDiffusion (SD) branch guided by\nHOI-oriented text inputs. 
By\nincorporating cross-attention constraints in the PFD branch and spatial merging\nat both latent and residual levels, PersonaHOI preserves personalized facial\ndetails while ensuring interactive non-facial regions. Experiments, validated\nby a novel interaction alignment metric, demonstrate the superior realism and\nscalability of PersonaHOI, establishing a new standard for practical\npersonalized face with HOI generation. Our code will be available at\nhttps://github.com/JoyHuYY1412/PersonaHOI\n","authors":["Xinting Hu","Haoran Wang","Jan Eric Lenssen","Bernt Schiele"],"pdf_url":"https://arxiv.org/pdf/2501.05823v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.13719v2","updated":"2025-01-10T10:00:58Z","published":"2024-07-18T17:18:25Z","title":"HazeCLIP: Towards Language Guided Real-World Image Dehazing","summary":" Existing methods have achieved remarkable performance in image dehazing,\nparticularly on synthetic datasets. However, they often struggle with\nreal-world hazy images due to domain shift, limiting their practical\napplicability. This paper introduces HazeCLIP, a language-guided adaptation\nframework designed to enhance the real-world performance of pre-trained\ndehazing networks. Inspired by the Contrastive Language-Image Pre-training\n(CLIP) model's ability to distinguish between hazy and clean images, we\nleverage it to evaluate dehazing results. Combined with a region-specific\ndehazing technique and tailored prompt sets, the CLIP model accurately\nidentifies hazy areas, providing a high-quality, human-like prior that guides\nthe fine-tuning process of pre-trained networks. Extensive experiments\ndemonstrate that HazeCLIP achieves state-of-the-art performance in real-world\nimage dehazing, evaluated through both visual quality and image quality\nassessment metrics. 
Codes are available at https://github.com/Troivyn/HazeCLIP.\n","authors":["Ruiyi Wang","Wenhao Li","Xiaohong Liu","Chunyi Li","Zicheng Zhang","Xiongkuo Min","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2407.13719v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.06154v2","updated":"2025-01-10T09:44:43Z","published":"2024-09-10T01:57:57Z","title":"Static for Dynamic: Towards a Deeper Understanding of Dynamic Facial\n Expressions Using Static Expression Data","summary":" Dynamic facial expression recognition (DFER) infers emotions from the\ntemporal evolution of expressions, unlike static facial expression recognition\n(SFER), which relies solely on a single snapshot. This temporal analysis\nprovides richer information and promises greater recognition capability.\nHowever, current DFER methods often exhibit unsatisfactory performance, largely\ndue to fewer training samples compared to SFER. Given the inherent correlation\nbetween static and dynamic expressions, we hypothesize that leveraging the\nabundant SFER data can enhance DFER. To this end, we propose Static-for-Dynamic\n(S4D), a unified dual-modal learning framework that integrates SFER data as a\ncomplementary resource for DFER. Specifically, S4D employs dual-modal\nself-supervised pre-training on facial images and videos using a shared Vision\nTransformer (ViT) encoder-decoder architecture, yielding improved\nspatiotemporal representations. The pre-trained encoder is then fine-tuned on\nstatic and dynamic expression datasets in a multi-task learning setup to\nfacilitate emotional information interaction. Unfortunately, vanilla multi-task\nlearning in our study results in negative transfer. To address this, we propose\nan innovative Mixture of Adapter Experts (MoAE) module that facilitates\ntask-specific knowledge acquisition while effectively extracting shared\nknowledge from both static and dynamic expression data. 
Extensive experiments\ndemonstrate that S4D achieves a deeper understanding of DFER, setting new\nstate-of-the-art performance on FERV39K, MAFW, and DFEW benchmarks, with\nweighted average recall (WAR) of 53.65\\%, 58.44\\%, and 76.68\\%, respectively.\nAdditionally, a systematic correlation analysis between SFER and DFER tasks is\npresented, which further elucidates the potential benefits of leveraging SFER.\n","authors":["Yin Chen","Jia Li","Yu Zhang","Zhenzhen Hu","Shiguang Shan","Meng Wang","Richang Hong"],"pdf_url":"https://arxiv.org/pdf/2409.06154v2.pdf","comment":"The code and model are publicly available here\n https://github.com/MSA-LMC/S4D"},{"id":"http://arxiv.org/abs/2411.10185v3","updated":"2025-01-10T09:37:49Z","published":"2024-11-15T13:34:46Z","title":"Efficient Progressive Image Compression with Variance-aware Masking","summary":" Learned progressive image compression is gaining momentum as it allows\nimproved image reconstruction as more bits are decoded at the receiver. We\npropose a progressive image compression method in which an image is first\nrepresented as a pair of base-quality and top-quality latent representations.\nNext, a residual latent representation is encoded as the element-wise\ndifference between the top and base representations. Our scheme enables\nprogressive image compression with element-wise granularity by introducing a\nmasking system that ranks each element of the residual latent representation\nfrom most to least important, dividing it into complementary components, which\ncan be transmitted separately to the decoder in order to obtain different\nreconstruction quality. The masking system does not add further parameters nor\ncomplexity. At the receiver, any elements of the top latent representation\nexcluded from the transmitted components can be independently replaced with the\nmean predicted by the hyperprior architecture, ensuring reliable\nreconstructions at any intermediate quality level. 
We also introduced Rate\nEnhancement Modules (REMs), which refine the estimation of entropy parameters\nusing already decoded components. We obtain results competitive with\nstate-of-the-art competitors, while significantly reducing computational\ncomplexity, decoding time, and number of parameters.\n","authors":["Alberto Presta","Enzo Tartaglione","Attilio Fiandrotti","Marco Grangetto","Pamela Cosman"],"pdf_url":"https://arxiv.org/pdf/2411.10185v3.pdf","comment":"9 pages. Accepted at WACV 2025"},{"id":"http://arxiv.org/abs/2501.01042v2","updated":"2025-01-10T09:21:43Z","published":"2025-01-02T03:52:22Z","title":"Image-based Multimodal Models as Intruders: Transferable Multimodal\n Attacks on Video-based MLLMs","summary":" Video-based multimodal large language models (V-MLLMs) have shown\nvulnerability to adversarial examples in video-text multimodal tasks. However,\nthe transferability of adversarial videos to unseen models--a common and\npractical real world scenario--remains unexplored. In this paper, we pioneer an\ninvestigation into the transferability of adversarial video samples across\nV-MLLMs. We find that existing adversarial attack methods face significant\nlimitations when applied in black-box settings for V-MLLMs, which we attribute\nto the following shortcomings: (1) lacking generalization in perturbing video\nfeatures, (2) focusing only on sparse key-frames, and (3) failing to integrate\nmultimodal information. To address these limitations and deepen the\nunderstanding of V-MLLM vulnerabilities in black-box scenarios, we introduce\nthe Image-to-Video MLLM (I2V-MLLM) attack. In I2V-MLLM, we utilize an\nimage-based multimodal model (IMM) as a surrogate model to craft adversarial\nvideo samples. Multimodal interactions and temporal information are integrated\nto disrupt video representations within the latent space, improving adversarial\ntransferability. 
In addition, a perturbation propagation technique is\nintroduced to handle different unknown frame sampling strategies. Experimental\nresults demonstrate that our method can generate adversarial examples that\nexhibit strong transferability across different V-MLLMs on multiple video-text\nmultimodal tasks. Compared to white-box attacks on these models, our black-box\nattacks (using BLIP-2 as surrogate model) achieve competitive performance, with\naverage attack success rates of 55.48% on MSVD-QA and 58.26% on MSRVTT-QA for\nVideoQA tasks, respectively. Our code will be released upon acceptance.\n","authors":["Linhao Huang","Xue Jiang","Zhiqiang Wang","Wentao Mo","Xi Xiao","Bo Han","Yongjie Yin","Feng Zheng"],"pdf_url":"https://arxiv.org/pdf/2501.01042v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05803v1","updated":"2025-01-10T09:10:30Z","published":"2025-01-10T09:10:30Z","title":"Alignment without Over-optimization: Training-Free Solution for\n Diffusion Models","summary":" Diffusion models excel in generative tasks, but aligning them with specific\nobjectives while maintaining their versatility remains challenging. Existing\nfine-tuning methods often suffer from reward over-optimization, while\napproximate guidance approaches fail to optimize target rewards effectively.\nAddressing these limitations, we propose a training-free sampling method based\non Sequential Monte Carlo (SMC) to sample from the reward-aligned target\ndistribution. Our approach, tailored for diffusion sampling and incorporating\ntempering techniques, achieves comparable or superior target rewards to\nfine-tuning methods while preserving diversity and cross-reward generalization.\nWe demonstrate its effectiveness in single-reward optimization, multi-objective\nscenarios, and online black-box optimization. This work offers a robust\nsolution for aligning diffusion models with diverse downstream objectives\nwithout compromising their general capabilities. 
Code is available at\nhttps://github.com/krafton-ai/DAS .\n","authors":["Sunwoo Kim","Minkyu Kim","Dongmin Park"],"pdf_url":"https://arxiv.org/pdf/2501.05803v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05091v2","updated":"2025-01-10T08:43:50Z","published":"2025-01-09T09:15:07Z","title":"ResPanDiff: Diffusion Model for Pansharpening by Inferring Residual\n Inference","summary":" The implementation of the diffusion-based pansharpening task is predominantly\nconstrained by its slow inference speed, which results from numerous sampling\nsteps. Despite the existing techniques aiming to accelerate sampling, they\noften compromise performance when fusing multi-source images. To ease this\nlimitation, we introduce a novel and efficient diffusion model named Diffusion\nModel for Pansharpening by Inferring Residual Inference (ResPanDiff), which\nsignificantly reduces the number of diffusion steps without sacrificing\nperformance on the pansharpening task. In ResPanDiff, we innovatively\npropose a Markov chain that transits from noisy residuals to the residuals\nbetween the LRMS and HRMS images, thereby reducing the number of sampling steps\nand enhancing performance. Additionally, we design the latent space to help the\nmodel extract more features at the encoding stage, Shallow\nCond-Injection~(SC-I) to help the model fetch cond-injected hidden features with\nhigher dimensions, and loss functions to provide better guidance for the\nresidual generation task, enabling the model to achieve superior performance in\nresidual generation. Furthermore, experimental evaluations on pansharpening\ndatasets demonstrate that the proposed method achieves superior outcomes\ncompared to recent state-of-the-art~(SOTA) techniques, requiring only 15\nsampling steps, which reduces the number of steps by over $90\\%$ compared with\nthe benchmark diffusion models. 
Our experiments also include thorough discussions and\nablation studies to underscore the effectiveness of our approach.\n","authors":["Shiqi Cao","Liangjian Deng","Shangqi Deng"],"pdf_url":"https://arxiv.org/pdf/2501.05091v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.02564v2","updated":"2025-01-10T08:40:49Z","published":"2025-01-05T14:42:47Z","title":"Balanced Multi-view Clustering","summary":" Multi-view clustering (MvC) aims to integrate information from different\nviews to enhance the capability of the model in capturing the underlying data\nstructures. The widely used joint training paradigm in MvC may not fully\nleverage the multi-view information, owing to the imbalanced and\nunder-optimized view-specific features caused by the uniform learning objective\nfor all views. For instance, particular views with more discriminative\ninformation could dominate the learning process in the joint training paradigm,\nleading to other views being under-optimized. To alleviate this issue, we first\nanalyze the imbalanced phenomenon in the joint-training paradigm of multi-view\nclustering from the perspective of gradient descent for each view-specific\nfeature extractor. Then, we propose a novel balanced multi-view clustering\n(BMvC) method, which introduces a view-specific contrastive regularization\n(VCR) to modulate the optimization of each view. Concretely, VCR preserves the\nsample similarities captured from the joint features and view-specific ones\ninto the clustering distributions corresponding to view-specific features to\nenhance the learning process of view-specific feature extractors. Additionally,\na theoretical analysis is provided to illustrate that VCR adaptively modulates\nthe magnitudes of gradients for updating the parameters of view-specific\nfeature extractors to achieve a balanced multi-view learning procedure. 
In such\na manner, BMvC achieves a better trade-off between the exploitation of\nview-specific patterns and the exploration of view-invariant patterns to fully\nlearn the multi-view information for the clustering task. Finally, a set of\nexperiments is conducted to verify the superiority of the proposed method\ncompared with state-of-the-art approaches both on eight benchmark MvC datasets\nand two spatially resolved transcriptomics datasets.\n","authors":["Zhenglai Li","Jun Wang","Chang Tang","Xinzhong Zhu","Wei Zhang","Xinwang Liu"],"pdf_url":"https://arxiv.org/pdf/2501.02564v2.pdf","comment":"We are withdrawing this paper due to issues in the experimental\n section related to the Application for Spatially Resolved Transcriptomics\n Data Clustering. These issues affect the validity of the results presented.\n We believe it is necessary to withdraw the paper to address these problems\n adequately before resubmission."},{"id":"http://arxiv.org/abs/2501.05786v1","updated":"2025-01-10T08:36:59Z","published":"2025-01-10T08:36:59Z","title":"Cryptanalysis of Cancelable Biometrics Vault","summary":" Cancelable Biometrics (CB) stands for a range of biometric transformation\nschemes combining biometrics with user specific tokens to generate secure\ntemplates. Required properties are the irreversibility, unlinkability and\nrecognition accuracy of templates while making their revocation possible. In\nbiometrics, a key-binding scheme is used for protecting a cryptographic key\nusing biometric data. The key can be recomputed only if the correct biometric\ndata is acquired during authentication. Typical applications of key-binding\nschemes include disk encryption, where the cryptographic key is used to encrypt\nand decrypt the disk. In this paper, we cryptanalyze a recent key-binding\nscheme, called Cancelable Biometrics Vault (CBV), based on cancelable\nbiometrics. 
More\nprecisely, the cancelable transformation introduced to instantiate the CBV\nframework, called the BioEncoding scheme, is attacked in terms of reversibility\nand linkability of templates. Subsequently, our linkability attack enables\nrecovery of the key in the vault without additional assumptions. Our\ncryptanalysis introduces a new perspective by uncovering the CBV scheme's\nrevocability and linkability vulnerabilities, which were not previously\nidentified in comparable biometric-based key-binding schemes.\n","authors":["Patrick Lacharme","Kevin Thiry-Atighehchi"],"pdf_url":"https://arxiv.org/pdf/2501.05786v1.pdf","comment":"17 pages, 4 figures"},{"id":"http://arxiv.org/abs/2410.05993v4","updated":"2025-01-10T08:35:13Z","published":"2024-10-08T12:44:57Z","title":"Aria: An Open Multimodal Native Mixture-of-Experts Model","summary":" Information comes in diverse modalities. Multimodal native AI models are\nessential to integrate real-world information and deliver comprehensive\nunderstanding. While proprietary multimodal native models exist, their lack of\nopenness imposes obstacles for adoptions, let alone adaptations. To fill this\ngap, we introduce Aria, an open multimodal native model with best-in-class\nperformance across a wide range of multimodal, language, and coding tasks. Aria\nis a mixture-of-experts model with 3.9B and 3.5B activated parameters per\nvisual token and text token, respectively. It outperforms Pixtral-12B and\nLlama3.2-11B, and is competitive against the best proprietary models on various\nmultimodal tasks. We pre-train Aria from scratch following a 4-stage pipeline,\nwhich progressively equips the model with strong capabilities in language\nunderstanding, multimodal understanding, long context window, and instruction\nfollowing. 
We open-source the model weights along with a codebase that\nfacilitates easy adoptions and adaptations of Aria in real-world applications.\n","authors":["Dongxu Li","Yudong Liu","Haoning Wu","Yue Wang","Zhiqi Shen","Bowen Qu","Xinyao Niu","Fan Zhou","Chengen Huang","Yanpeng Li","Chongyan Zhu","Xiaoyi Ren","Chao Li","Yifan Ye","Peng Liu","Lihuan Zhang","Hanshu Yan","Guoyin Wang","Bei Chen","Junnan Li"],"pdf_url":"https://arxiv.org/pdf/2410.05993v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05783v1","updated":"2025-01-10T08:33:31Z","published":"2025-01-10T08:33:31Z","title":"UV-Attack: Physical-World Adversarial Attacks for Person Detection via\n Dynamic-NeRF-based UV Mapping","summary":" In recent research, adversarial attacks on person detectors using patches or\nstatic 3D model-based texture modifications have struggled with low success\nrates due to the flexible nature of human movement. Modeling the 3D\ndeformations caused by various actions has been a major challenge. Fortunately,\nadvancements in Neural Radiance Fields (NeRF) for dynamic human modeling offer\nnew possibilities. In this paper, we introduce UV-Attack, a groundbreaking\napproach that achieves high success rates even with extensive and unseen human\nactions. We address the challenge above by leveraging dynamic-NeRF-based UV\nmapping. UV-Attack can generate human images across diverse actions and\nviewpoints, and even create novel actions by sampling from the SMPL parameter\nspace. While dynamic NeRF models are capable of modeling human bodies,\nmodifying clothing textures is challenging because they are embedded in neural\nnetwork parameters. To tackle this, UV-Attack generates UV maps instead of RGB\nimages and modifies the texture stacks. This approach enables real-time texture\nedits and makes the attack more practical. We also propose a novel Expectation\nover Pose Transformation loss (EoPT) to improve the evasion success rate on\nunseen poses and views. 
Our experiments show that UV-Attack achieves a 92.75%\nattack success rate against the FastRCNN model across varied poses in dynamic\nvideo settings, significantly outperforming the state-of-the-art AdvCamou\nattack, which only had a 28.50% ASR. Moreover, we achieve 49.5% ASR on the\nlatest YOLOv8 detector in black-box settings. This work highlights the\npotential of dynamic NeRF-based UV mapping for creating more effective\nadversarial attacks on person detectors, addressing key challenges in modeling\nhuman movement and texture modification.\n","authors":["Yanjie Li","Wenxuan Zhang","Kaisheng Liang","Bin Xiao"],"pdf_url":"https://arxiv.org/pdf/2501.05783v1.pdf","comment":"23 pages, 22 figures, submitted to ICLR2025"},{"id":"http://arxiv.org/abs/2501.05777v1","updated":"2025-01-10T08:18:37Z","published":"2025-01-10T08:18:37Z","title":"StructSR: Refuse Spurious Details in Real-World Image Super-Resolution","summary":" Diffusion-based models have shown great promise in real-world image\nsuper-resolution (Real-ISR), but often generate content with structural errors\nand spurious texture details due to the empirical priors and illusions of these\nmodels. To address this issue, we introduce StructSR, a simple, effective, and\nplug-and-play method that enhances structural fidelity and suppresses spurious\ndetails for diffusion-based Real-ISR. StructSR operates without the need for\nadditional fine-tuning, external model priors, or high-level semantic\nknowledge. At its core is the Structure-Aware Screening (SAS) mechanism, which\nidentifies the image with the highest structural similarity to the\nlow-resolution (LR) input in the early inference stage, allowing us to leverage\nit as a historical structure knowledge to suppress the generation of spurious\ndetails. By intervening in the diffusion inference process, StructSR seamlessly\nintegrates with existing diffusion-based Real-ISR models. 
Our experimental\nresults demonstrate that StructSR significantly improves the fidelity of\nstructure and texture, improving the PSNR and SSIM metrics by an average of\n5.27% and 9.36% on a synthetic dataset (DIV2K-Val) and 4.13% and 8.64% on two\nreal-world datasets (RealSR and DRealSR) when integrated with four\nstate-of-the-art diffusion-based Real-ISR methods.\n","authors":["Yachao Li","Dong Liang","Tianyu Ding","Sheng-Jun Huang"],"pdf_url":"https://arxiv.org/pdf/2501.05777v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05769v1","updated":"2025-01-10T07:58:38Z","published":"2025-01-10T07:58:38Z","title":"Conditional Diffusion Model for Electrical Impedance Tomography","summary":" Electrical impedance tomography (EIT) is a non-invasive imaging technique,\nwhich has been widely used in the fields of industrial inspection, medical\nmonitoring and tactile sensing. However, due to the inherent non-linearity and\nill-conditioned nature of the EIT inverse problem, the reconstructed image is\nhighly sensitive to the measured data, and random noise artifacts often appear\nin the reconstructed image, which greatly limits the application of EIT. To\naddress this issue, a conditional diffusion model with voltage consistency\n(CDMVC) is proposed in this study. The method consists of a pre-imaging module,\na conditional diffusion model for reconstruction, a forward voltage constraint\nnetwork and a scheme of voltage consistency constraint during sampling process.\nThe pre-imaging module is employed to generate the initial reconstruction. This\nserves as a condition for training the conditional diffusion model. Finally,\nbased on the forward voltage constraint network, a voltage consistency\nconstraint is implemented in the sampling phase to incorporate forward\ninformation of EIT, thereby enhancing imaging quality. A more complete dataset,\nincluding both common and complex concave shapes, is generated. 
The proposed\nmethod is validated using both simulation and physical experiments.\nExperimental results demonstrate that our method significantly improves the\nquality of reconstructed images. In addition, experimental results also\ndemonstrate that our method has good robustness and generalization performance.\n","authors":["Duanpeng Shi","Wendong Zheng","Di Guo","Huaping Liu"],"pdf_url":"https://arxiv.org/pdf/2501.05769v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05767v1","updated":"2025-01-10T07:56:23Z","published":"2025-01-10T07:56:23Z","title":"Migician: Revealing the Magic of Free-Form Multi-Image Grounding in\n Multimodal Large Language Models","summary":" The recent advancement of Multimodal Large Language Models (MLLMs) has\nsignificantly improved their fine-grained perception of single images and\ngeneral comprehension across multiple images. However, existing MLLMs still\nface challenges in achieving precise grounding in complex multi-image\nscenarios. To address this, we first explore a Chain-of-Thought (CoT) framework\nthat integrates single-image grounding with multi-image comprehension. While\npartially effective, it remains unstable and struggles to capture abstract\nvisual information due to its non-end-to-end nature. Therefore, we introduce\nMigician, the first multi-image grounding model capable of performing free-form\nand accurate grounding across multiple images. To support this, we present the\nMGrounding-630k dataset, which comprises data for several multi-image grounding\ntasks derived from existing datasets, along with newly generated free-form\ngrounding instruction-following data. Furthermore, we propose MIG-Bench, a\ncomprehensive benchmark specifically designed for evaluating multi-image\ngrounding capabilities. Experimental results demonstrate that our model\nachieves significantly superior multi-image grounding capabilities,\noutperforming the best existing MLLMs by 21.61% and even surpassing much larger\n70B models. 
Our code, model, dataset, and benchmark are fully open-sourced.\n","authors":["You Li","Heyu Huang","Chi Chen","Kaiyu Huang","Chao Huang","Zonghao Guo","Zhiyuan Liu","Jinan Xu","Yuhua Li","Ruixuan Li","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2501.05767v1.pdf","comment":"20 pages, 8 figures"},{"id":"http://arxiv.org/abs/2501.05763v1","updated":"2025-01-10T07:41:47Z","published":"2025-01-10T07:41:47Z","title":"StarGen: A Spatiotemporal Autoregression Framework with Video Diffusion\n Model for Scalable and Controllable Scene Generation","summary":" Recent advances in large reconstruction and generative models have\nsignificantly improved scene reconstruction and novel view generation. However,\ndue to compute limitations, each inference with these large models is confined\nto a small area, making long-range consistent scene generation challenging. To\naddress this, we propose StarGen, a novel framework that employs a pre-trained\nvideo diffusion model in an autoregressive manner for long-range scene\ngeneration. The generation of each video clip is conditioned on the 3D warping\nof spatially adjacent images and the temporally overlapping image from\npreviously generated clips, improving spatiotemporal consistency in long-range\nscene generation with precise pose control. The spatiotemporal condition is\ncompatible with various input conditions, facilitating diverse tasks, including\nsparse view interpolation, perpetual view generation, and layout-conditioned\ncity generation. 
Quantitative and qualitative evaluations demonstrate StarGen's\nsuperior scalability, fidelity, and pose accuracy compared to state-of-the-art\nmethods.\n","authors":["Shangjin Zhai","Zhichao Ye","Jialin Liu","Weijian Xie","Jiaqi Hu","Zhen Peng","Hua Xue","Danpeng Chen","Xiaomeng Wang","Lei Yang","Nan Wang","Haomin Liu","Guofeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.05763v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.11210v2","updated":"2025-01-10T07:26:43Z","published":"2024-12-15T15:04:27Z","title":"ViPOcc: Leveraging Visual Priors from Vision Foundation Models for\n Single-View 3D Occupancy Prediction","summary":" Inferring the 3D structure of a scene from a single image is an ill-posed and\nchallenging problem in the field of vision-centric autonomous driving. Existing\nmethods usually employ neural radiance fields to produce voxelized 3D\noccupancy, lacking instance-level semantic reasoning and temporal photometric\nconsistency. In this paper, we propose ViPOcc, which leverages the visual\npriors from vision foundation models (VFMs) for fine-grained 3D occupancy\nprediction. Unlike previous works that solely employ volume rendering for RGB\nand depth image reconstruction, we introduce a metric depth estimation branch,\nin which an inverse depth alignment module is proposed to bridge the domain gap\nin depth distribution between VFM predictions and the ground truth. The\nrecovered metric depth is then utilized in temporal photometric alignment and\nspatial geometric alignment to ensure accurate and consistent 3D occupancy\nprediction. Additionally, we also propose a semantic-guided non-overlapping\nGaussian mixture sampler for efficient, instance-aware ray sampling, which\naddresses the redundant and imbalanced sampling issue that still exists in\nprevious state-of-the-art methods. 
Extensive experiments demonstrate the\nsuperior performance of ViPOcc in both 3D occupancy prediction and depth\nestimation tasks on the KITTI-360 and KITTI Raw datasets. Our code is available\nat: \\url{https://mias.group/ViPOcc}.\n","authors":["Yi Feng","Yu Han","Xijing Zhang","Tanghui Li","Yanting Zhang","Rui Fan"],"pdf_url":"https://arxiv.org/pdf/2412.11210v2.pdf","comment":"accepted to AAAI25"},{"id":"http://arxiv.org/abs/2412.10718v3","updated":"2025-01-10T07:20:26Z","published":"2024-12-14T07:22:03Z","title":"GridShow: Omni Visual Generation","summary":" In this paper, we introduce GRID, a novel paradigm that reframes a broad\nrange of visual generation tasks as the problem of arranging grids, akin to\nfilm strips. At its core, GRID transforms temporal sequences into grid layouts,\nenabling image generation models to process visual sequences holistically. To\nachieve both layout consistency and motion coherence, we develop a parallel\nflow-matching training strategy that combines layout matching and temporal\nlosses, guided by a coarse-to-fine schedule that evolves from basic layouts to\nprecise motion control. Our approach demonstrates remarkable efficiency,\nachieving up to 35 faster inference speeds while using 1/1000 of the\ncomputational resources compared to specialized models. Extensive experiments\nshow that GRID exhibits exceptional versatility across diverse visual\ngeneration tasks, from Text-to-Video to 3D Editing, while maintaining its\nfoundational image generation capabilities. 
This dual strength in both expanded\napplications and preserved core competencies establishes GRID as an efficient\nand versatile omni-solution for visual generation.\n","authors":["Cong Wan","Xiangyang Luo","Zijian Cai","Yiren Song","Yunlong Zhao","Yifan Bai","Yuhang He","Yihong Gong"],"pdf_url":"https://arxiv.org/pdf/2412.10718v3.pdf","comment":"Codes: https://github.com/Should-AI-Lab/GRID"},{"id":"http://arxiv.org/abs/2501.05757v1","updated":"2025-01-10T07:19:41Z","published":"2025-01-10T07:19:41Z","title":"Locality-aware Gaussian Compression for Fast and High-quality Rendering","summary":" We present LocoGS, a locality-aware 3D Gaussian Splatting (3DGS) framework\nthat exploits the spatial coherence of 3D Gaussians for compact modeling of\nvolumetric scenes. To this end, we first analyze the local coherence of 3D\nGaussian attributes, and propose a novel locality-aware 3D Gaussian\nrepresentation that effectively encodes locally-coherent Gaussian attributes\nusing a neural field representation with a minimal storage requirement. On top\nof the novel representation, LocoGS is carefully designed with additional\ncomponents such as dense initialization, an adaptive spherical harmonics\nbandwidth scheme and different encoding schemes for different Gaussian\nattributes to maximize compression performance. Experimental results\ndemonstrate that our approach outperforms the rendering quality of existing\ncompact Gaussian representations for representative real-world 3D datasets\nwhile achieving from 54.6$\\times$ to 96.6$\\times$ compressed storage size and\nfrom 2.1$\\times$ to 2.4$\\times$ rendering speed than 3DGS. 
Our approach\nalso demonstrates an average 2.4$\\times$ higher rendering speed than the\nstate-of-the-art compression method with comparable compression performance.\n","authors":["Seungjoo Shin","Jaesik Park","Sunghyun Cho"],"pdf_url":"https://arxiv.org/pdf/2501.05757v1.pdf","comment":"28 pages, 15 figures, and 14 tables"},{"id":"http://arxiv.org/abs/2501.05750v1","updated":"2025-01-10T06:58:14Z","published":"2025-01-10T06:58:14Z","title":"Semantic Mapping in Indoor Embodied AI -- A Comprehensive Survey and\n Future Directions","summary":" Intelligent embodied agents (e.g. robots) need to perform complex semantic\ntasks in unfamiliar environments. Among many skills that the agents need to\npossess, building and maintaining a semantic map of the environment is most\ncrucial in long-horizon tasks. A semantic map captures information about the\nenvironment in a structured way, allowing the agent to reference it for\nadvanced reasoning throughout the task. While existing surveys in embodied AI\nfocus on general advancements or specific tasks like navigation and\nmanipulation, this paper provides a comprehensive review of semantic\nmap-building approaches in embodied AI, specifically for indoor navigation. We\ncategorize these approaches based on their structural representation (spatial\ngrids, topological graphs, dense point-clouds or hybrid maps) and the type of\ninformation they encode (implicit features or explicit environmental data). We\nalso explore the strengths and limitations of the map building techniques,\nhighlight current challenges, and propose future research directions. We\nidentify that the field is moving towards developing open-vocabulary,\nqueryable, task-agnostic map representations, while high memory demands and\ncomputational inefficiency still remain open challenges. 
This survey\naims to guide current and future researchers in advancing semantic mapping\ntechniques for embodied AI systems.\n","authors":["Sonia Raychaudhuri","Angel X. Chang"],"pdf_url":"https://arxiv.org/pdf/2501.05750v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05744v1","updated":"2025-01-10T06:20:27Z","published":"2025-01-10T06:20:27Z","title":"LLVD: LSTM-based Explicit Motion Modeling in Latent Space for Blind\n Video Denoising","summary":" Video restoration plays a pivotal role in revitalizing degraded video content\nby rectifying imperfections caused by various degradations introduced during\ncapturing (sensor noise, motion blur, etc.), saving/sharing (compression,\nresizing, etc.) and editing. This paper introduces a novel algorithm designed\nfor scenarios where noise is introduced during video capture, aiming to enhance\nthe visual quality of videos by reducing unwanted noise artifacts. We propose\nthe Latent space LSTM Video Denoiser (LLVD), an end-to-end blind denoising\nmodel. LLVD uniquely combines spatial and temporal feature extraction,\nemploying Long Short Term Memory (LSTM) within the encoded feature domain. This\nintegration of LSTM layers is crucial for maintaining continuity and minimizing\nflicker in the restored video. Moreover, processing frames in the encoded\nfeature domain significantly reduces computations, resulting in a very\nlightweight architecture. LLVD's blind nature makes it versatile for real,\nin-the-wild denoising scenarios where prior information about noise\ncharacteristics is not available. Experiments reveal that LLVD demonstrates\nexcellent performance for both synthetic and captured noise. 
Specifically, LLVD\nsurpasses the current State-Of-The-Art (SOTA) in RAW denoising by 0.3dB, while\nalso achieving a 59\\% reduction in computational complexity.\n","authors":["Loay Rashid","Siddharth Roheda","Amit Unde"],"pdf_url":"https://arxiv.org/pdf/2501.05744v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05733v1","updated":"2025-01-10T06:02:06Z","published":"2025-01-10T06:02:06Z","title":"TB-Bench: Training and Testing Multi-Modal AI for Understanding\n Spatio-Temporal Traffic Behaviors from Dashcam Images/Videos","summary":" The application of Multi-modal Large Language Models (MLLMs) in Autonomous\nDriving (AD) faces significant challenges due to their limited training on\ntraffic-specific data and the absence of dedicated benchmarks for\nspatiotemporal understanding. This study addresses these issues by proposing\nTB-Bench, a comprehensive benchmark designed to evaluate MLLMs on understanding\ntraffic behaviors across eight perception tasks from ego-centric views. We also\nintroduce vision-language instruction tuning datasets, TB-100k and TB-250k,\nalong with simple yet effective baselines for the tasks. Through extensive\nexperiments, we show that existing MLLMs underperform in these tasks, with even\na powerful model like GPT-4o achieving less than 35% accuracy on average. 
In\ncontrast, when fine-tuned with TB-100k or TB-250k, our baseline models achieve\naverage accuracy up to 85%, significantly enhancing performance on the tasks.\nAdditionally, we demonstrate performance transfer by co-training TB-100k with\nanother traffic dataset, leading to improved performance on the latter.\nOverall, this study represents a step forward by introducing a comprehensive\nbenchmark, high-quality datasets, and baselines, thus supporting the gradual\nintegration of MLLMs into the perception, prediction, and planning stages of\nAD.\n","authors":["Korawat Charoenpitaks","Van-Quang Nguyen","Masanori Suganuma","Kentaro Arai","Seiji Totsuka","Hiroshi Ino","Takayuki Okatani"],"pdf_url":"https://arxiv.org/pdf/2501.05733v1.pdf","comment":"Main Paper: 8 pages, Supplementary Materials: 15 pages"},{"id":"http://arxiv.org/abs/2501.05728v1","updated":"2025-01-10T05:53:32Z","published":"2025-01-10T05:53:32Z","title":"Super-class guided Transformer for Zero-Shot Attribute Classification","summary":" Attribute classification is crucial for identifying specific characteristics\nwithin image regions. Vision-Language Models (VLMs) have been effective in\nzero-shot tasks by leveraging their general knowledge from large-scale\ndatasets. Recent studies demonstrate that transformer-based models with\nclass-wise queries can effectively address zero-shot multi-label\nclassification. However, poor utilization of the relationship between seen and\nunseen attributes makes the model lack generalizability. Additionally,\nattribute classification generally involves many attributes, making maintaining\nthe model's scalability difficult. To address these issues, we propose\nSuper-class guided transFormer (SugaFormer), a novel framework that leverages\nsuper-classes to enhance scalability and generalizability for zero-shot\nattribute classification. 
SugaFormer employs Super-class Query Initialization\n(SQI) to reduce the number of queries, utilizing common semantic information\nfrom super-classes, and incorporates Multi-context Decoding (MD) to handle\ndiverse visual cues. To strengthen generalizability, we introduce two knowledge\ntransfer strategies that utilize VLMs. During training, Super-class guided\nConsistency Regularization (SCR) aligns SugaFormer's features with VLMs using\nregion-specific prompts, and during inference, Zero-shot Retrieval-based Score\nEnhancement (ZRSE) refines predictions for unseen attributes. Extensive\nexperiments demonstrate that SugaFormer achieves state-of-the-art performance\nacross three widely-used attribute classification benchmarks under zero-shot\nand cross-dataset transfer settings. Our code is available at\nhttps://github.com/mlvlab/SugaFormer.\n","authors":["Sehyung Kim","Chanhyeong Yang","Jihwan Park","Taehoon Song","Hyunwoo J. Kim"],"pdf_url":"https://arxiv.org/pdf/2501.05728v1.pdf","comment":"AAAI25"},{"id":"http://arxiv.org/abs/2501.05717v1","updated":"2025-01-10T05:29:09Z","published":"2025-01-10T05:29:09Z","title":"Zero-shot Shark Tracking and Biometrics from Aerial Imagery","summary":" The recent widespread adoption of drones for studying marine animals provides\nopportunities for deriving biological information from aerial imagery. The\nlarge scale of imagery data acquired from drones is well suited for machine\nlearning (ML) analysis. Development of ML models for analyzing marine animal\naerial imagery has followed the classical paradigm of training, testing, and\ndeploying a new model for each dataset, requiring significant time, human\neffort, and ML expertise. We introduce Frame Level ALIgnment and tRacking\n(FLAIR), which leverages the video understanding of Segment Anything Model 2\n(SAM2) and the vision-language capabilities of Contrastive Language-Image\nPre-training (CLIP). 
FLAIR takes a drone video as input and outputs\nsegmentation masks of the species of interest across the video. Notably, FLAIR\nleverages a zero-shot approach, eliminating the need for labeled data, training\na new model, or fine-tuning an existing model to generalize to other species.\nWith a dataset of 18,000 drone images of Pacific nurse sharks, we trained\nstate-of-the-art object detection models to compare against FLAIR. We show that\nFLAIR massively outperforms these object detectors and performs competitively\nagainst two human-in-the-loop methods for prompting SAM2, achieving a Dice\nscore of 0.81. FLAIR readily generalizes to other shark species without\nadditional human effort and can be combined with novel heuristics to\nautomatically extract relevant information including length and tailbeat\nfrequency. FLAIR has significant potential to accelerate aerial imagery\nanalysis workflows, requiring markedly less human effort and expertise than\ntraditional machine learning workflows, while achieving superior accuracy. By\nreducing the effort required for aerial imagery analysis, FLAIR allows\nscientists to spend more time interpreting results and deriving insights about\nmarine ecosystems.\n","authors":["Chinmay K Lalgudi","Mark E Leone","Jaden V Clark","Sergio Madrigal-Mora","Mario Espinoza"],"pdf_url":"https://arxiv.org/pdf/2501.05717v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.12322v3","updated":"2025-01-10T05:28:08Z","published":"2022-12-22T08:33:32Z","title":"Infrared Image Super-Resolution: Systematic Review, and Future Trends","summary":" Image Super-Resolution (SR) is essential for a wide range of computer vision\nand image processing tasks. Investigating infrared (IR) image (or thermal\nimages) super-resolution is a continuing concern within the development of deep\nlearning. 
This survey aims to provide a comprehensive perspective of IR image\nsuper-resolution, including its applications, hardware imaging system dilemmas,\nand taxonomy of image processing methodologies. In addition, the datasets and\nevaluation metrics in IR image super-resolution tasks are also discussed.\nFurthermore, the deficiencies in current technologies and possible promising\ndirections for the community to explore are highlighted. To cope with the rapid\ndevelopment in this field, we intend to regularly update the relevant excellent\nwork at \\url{https://github.com/yongsongH/Infrared_Image_SR_Survey}.\n","authors":["Yongsong Huang","Tomo Miyazaki","Xiaofeng Liu","Shinichiro Omachi"],"pdf_url":"https://arxiv.org/pdf/2212.12322v3.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2404.11615v2","updated":"2025-01-10T05:22:40Z","published":"2024-04-17T17:59:59Z","title":"Factorized Diffusion: Perceptual Illusions by Noise Decomposition","summary":" Given a factorization of an image into a sum of linear components, we present\na zero-shot method to control each individual component through diffusion model\nsampling. For example, we can decompose an image into low and high spatial\nfrequencies and condition these components on different text prompts. This\nproduces hybrid images, which change appearance depending on viewing distance.\nBy decomposing an image into three frequency subbands, we can generate hybrid\nimages with three prompts. We also use a decomposition into grayscale and color\ncomponents to produce images whose appearance changes when they are viewed in\ngrayscale, a phenomenon that naturally occurs under dim lighting. And we explore\na decomposition by a motion blur kernel, which produces images that change\nappearance under motion blurring. Our method works by denoising with a\ncomposite noise estimate, built from the components of noise estimates\nconditioned on different prompts. 
We also show that for certain decompositions,\nour method recovers prior approaches to compositional generation and spatial\ncontrol. Finally, we show that we can extend our approach to generate hybrid\nimages from real images. We do this by holding one component fixed and\ngenerating the remaining components, effectively solving an inverse problem.\n","authors":["Daniel Geng","Inbum Park","Andrew Owens"],"pdf_url":"https://arxiv.org/pdf/2404.11615v2.pdf","comment":"ECCV 2024 camera ready version + more readable size"},{"id":"http://arxiv.org/abs/2501.05711v1","updated":"2025-01-10T05:01:58Z","published":"2025-01-10T05:01:58Z","title":"From My View to Yours: Ego-Augmented Learning in Large Vision Language\n Models for Understanding Exocentric Daily Living Activities","summary":" Large Vision Language Models (LVLMs) have demonstrated impressive\ncapabilities in video understanding, yet their adoption for Activities of Daily\nLiving (ADL) remains limited by their inability to capture fine-grained\ninteractions and spatial relationships. This limitation is particularly evident\nin ADL tasks, where understanding detailed human-object interaction and\nhuman-centric motion is crucial for applications such as elderly monitoring and\ncognitive assessment. To address this, we aim to leverage the complementary\nnature of egocentric views to enhance LVLM's understanding of exocentric ADL\nvideos. Consequently, we propose an online ego2exo distillation approach to\nlearn ego-augmented exo representations in LVLMs. While effective, this\napproach requires paired ego-exo training data, which is impractical to collect\nfor real-world ADL scenarios. Consequently, we develop EgoMimic, a\nskeleton-guided method that can generate mimicked ego views from exocentric\nvideos. 
We find that the exo representations of our ego-augmented LVLMs\nsuccessfully learn to extract ego-perspective cues, demonstrated through\ncomprehensive evaluation on six ADL benchmarks and our proposed\nEgoPerceptionMCQ benchmark designed specifically to assess egocentric\nunderstanding from exocentric videos. Code, models, and data will be\nopen-sourced at https://github.com/dominickrei/EgoExo4ADL.\n","authors":["Dominick Reilly","Manish Kumar Govind","Srijan Das"],"pdf_url":"https://arxiv.org/pdf/2501.05711v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04934v2","updated":"2025-01-10T04:43:24Z","published":"2025-01-09T02:52:30Z","title":"Plug-and-Play DISep: Separating Dense Instances for Scene-to-Pixel\n Weakly-Supervised Change Detection in High-Resolution Remote Sensing Images","summary":" Existing Weakly-Supervised Change Detection (WSCD) methods often encounter\nthe problem of \"instance lumping\" under scene-level supervision, particularly\nin scenarios with a dense distribution of changed instances (i.e., changed\nobjects). In these scenarios, unchanged pixels between changed instances are\nalso mistakenly identified as changed, causing multiple changes to be\nmistakenly viewed as one. In practical applications, this issue prevents the\naccurate quantification of the number of changes. To address this issue, we\npropose a Dense Instance Separation (DISep) method as a plug-and-play solution,\nrefining pixel features from a unified instance perspective under scene-level\nsupervision. Specifically, our DISep comprises a three-step iterative training\nprocess: 1) Instance Localization: We locate instance candidate regions for\nchanged pixels using high-pass class activation maps. 2) Instance Retrieval: We\nidentify and group these changed pixels into different instance IDs through\nconnectivity searching. Then, based on the assigned instance IDs, we extract\ncorresponding pixel-level features on a per-instance basis. 
3) Instance\nSeparation: We introduce a separation loss to enforce intra-instance pixel\nconsistency in the embedding space, thereby ensuring separable instance feature\nrepresentations. The proposed DISep adds only minimal training cost and no\ninference cost. It can be seamlessly integrated to enhance existing WSCD\nmethods. We achieve state-of-the-art performance by enhancing three\nTransformer-based and four ConvNet-based methods on the LEVIR-CD, WHU-CD,\nDSIFN-CD, SYSU-CD, and CDD datasets. Additionally, our DISep can be used to\nimprove fully-supervised change detection methods. Code is available at\nhttps://github.com/zhenghuizhao/Plug-and-Play-DISep-for-Change-Detection.\n","authors":["Zhenghui Zhao","Chen Wu","Lixiang Ru","Di Wang","Hongruixuan Chen","Cuiqun Chen"],"pdf_url":"https://arxiv.org/pdf/2501.04934v2.pdf","comment":"Accepted by ISPRS Journal of Photogrammetry and Remote Sensing"},{"id":"http://arxiv.org/abs/2501.05710v1","updated":"2025-01-10T04:41:37Z","published":"2025-01-10T04:41:37Z","title":"EmotiCrafter: Text-to-Emotional-Image Generation based on\n Valence-Arousal Model","summary":" Recent research shows that emotions can enhance users' cognition and\ninfluence information communication. While research on visual emotion analysis\nis extensive, limited work has been done on helping users generate emotionally\nrich image content. Existing work on emotional image generation relies on\ndiscrete emotion categories, making it challenging to capture complex and\nsubtle emotional nuances accurately. Additionally, these methods struggle to\ncontrol the specific content of generated images based on text prompts. 
In this\nwork, we introduce the new task of continuous emotional image content\ngeneration (C-EICG) and present EmotiCrafter, an emotional image generation\nmodel that generates images based on text prompts and Valence-Arousal values.\nSpecifically, we propose a novel emotion-embedding mapping network that embeds\nValence-Arousal values into textual features, enabling the capture of specific\nemotions in alignment with intended input prompts. Additionally, we introduce a\nloss function to enhance emotion expression. The experimental results show that\nour method effectively generates images representing specific emotions with the\ndesired content and outperforms existing techniques.\n","authors":["Yi He","Shengqi Dang","Long Ling","Ziqing Qian","Nanxuan Zhao","Nan Cao"],"pdf_url":"https://arxiv.org/pdf/2501.05710v1.pdf","comment":"11 pages, 8 figures"},{"id":"http://arxiv.org/abs/2404.15580v2","updated":"2025-01-10T04:23:51Z","published":"2024-04-24T01:14:33Z","title":"MiM: Mask in Mask Self-Supervised Pre-Training for 3D Medical Image\n Analysis","summary":" The Vision Transformer (ViT) has demonstrated remarkable performance in\nSelf-Supervised Learning (SSL) for 3D medical image analysis. Masked\nAutoEncoder (MAE) for feature pre-training can further unleash the potential of\nViT on various medical vision tasks. However, due to large spatial sizes with\nmuch higher dimensions of 3D medical images, the lack of hierarchical design\nfor MAE may hinder the performance of downstream tasks. In this paper, we\npropose a novel \\textit{Mask in Mask (MiM)} pre-training framework for 3D\nmedical images, which aims to advance MAE by learning discriminative\nrepresentation from hierarchical visual tokens across varying scales. We\nintroduce multiple levels of granularity for masked inputs from the volume,\nwhich are then reconstructed simultaneously ranging at both fine and coarse\nlevels. 
Additionally, a cross-level alignment mechanism is applied to adjacent\nlevel volumes to enforce anatomical similarity hierarchically. Furthermore, we\nadopt a hybrid backbone to enhance the hierarchical representation learning\nefficiently during the pre-training. MiM was pre-trained on a large-scale\ncollection of available 3D volumetric images, \\textit{i.e.,} Computed Tomography (CT) images\ncontaining various body parts. Extensive experiments on thirteen public\ndatasets demonstrate the superiority of MiM over other SSL methods in\norgan/lesion/tumor segmentation and disease classification. We further scale up\nthe MiM to large pre-training datasets with more than 10k volumes, showing that\nlarge-scale pre-training can further enhance the performance of downstream\ntasks. These improvements also suggest that the research community should pay\nmore attention to the scale of the pre-training dataset when building healthcare\nfoundation models for 3D medical images.\n","authors":["Jiaxin Zhuang","Linshan Wu","Qiong Wang","Peng Fei","Varut Vardhanabhuti","Lin Luo","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2404.15580v2.pdf","comment":"submitted to a journal, updated v2"},{"id":"http://arxiv.org/abs/2412.13717v2","updated":"2025-01-10T04:09:17Z","published":"2024-12-18T10:55:58Z","title":"Towards Automatic Evaluation for Image Transcreation","summary":" Beyond conventional paradigms of translating speech and text, recently, there\nhas been interest in automated transcreation of images to facilitate\nlocalization of visual content across different cultures. Attempts to define\nthis as a formal Machine Learning (ML) problem have been impeded by the lack of\nautomatic evaluation mechanisms, with previous work relying solely on human\nevaluation. 
In this paper, we seek to close this gap by proposing a suite of\nautomatic evaluation metrics inspired by machine translation (MT) metrics,\ncategorized into: a) Object-based, b) Embedding-based, and c) VLM-based.\nDrawing on theories from translation studies and real-world transcreation\npractices, we identify three critical dimensions of image transcreation:\ncultural relevance, semantic equivalence and visual similarity, and design our\nmetrics to evaluate systems along these axes. Our results show that proprietary\nVLMs best identify cultural relevance and semantic equivalence, while\nvision-encoder representations are adept at measuring visual similarity.\nMeta-evaluation across 7 countries shows our metrics agree strongly with human\nratings, with average segment-level correlations ranging from 0.55-0.87.\nFinally, through a discussion of the merits and demerits of each metric, we\noffer a robust framework for automated image transcreation evaluation, grounded\nin both theoretical foundations and practical application. Our code can be\nfound here: https://github.com/simran-khanuja/automatic-eval-transcreation\n","authors":["Simran Khanuja","Vivek Iyer","Claire He","Graham Neubig"],"pdf_url":"https://arxiv.org/pdf/2412.13717v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05690v1","updated":"2025-01-10T03:42:37Z","published":"2025-01-10T03:42:37Z","title":"Overcoming Language Priors for Visual Question Answering Based on\n Knowledge Distillation","summary":" Previous studies have pointed out that visual question answering (VQA) models\nare prone to relying on language priors for answer predictions. In this\ncontext, predictions often depend on linguistic shortcuts rather than a\ncomprehensive grasp of multimodal knowledge, which diminishes their\ngeneralization ability. In this paper, we propose a novel method, namely, KDAR,\nleveraging knowledge distillation to address the prior-dependency dilemmas\nwithin the VQA task. 
Specifically, the regularization effect facilitated by\nsoft labels from a well-trained teacher is employed to penalize overfitting to\nthe most common answers. The soft labels, which serve a regularization role,\nalso provide semantic guidance that narrows the range of candidate answers.\nAdditionally, we design an adaptive sample-wise reweighting learning strategy\nto further mitigate bias by dynamically adjusting the importance of each\nsample. Experimental results demonstrate that our method enhances performance\nin both OOD and IID settings. Our method achieves state-of-the-art performance\non the VQA-CPv2 out-of-distribution (OOD) benchmark, significantly\noutperforming previous state-of-the-art approaches.\n","authors":["Daowan Peng","Wei Wei"],"pdf_url":"https://arxiv.org/pdf/2501.05690v1.pdf","comment":"Accepted to ICME2024"},{"id":"http://arxiv.org/abs/2501.05688v1","updated":"2025-01-10T03:41:03Z","published":"2025-01-10T03:41:03Z","title":"eKalibr: Dynamic Intrinsic Calibration for Event Cameras From First\n Principles of Events","summary":" The bio-inspired event camera has garnered extensive research attention in\nrecent years, owing to its significant potential derived from its high dynamic\nrange and low latency characteristics. Similar to the standard camera, the\nevent camera requires precise intrinsic calibration to facilitate further\nhigh-level visual applications, such as pose estimation and mapping. While\nseveral calibration methods for event cameras have been proposed, most of them\nare either (i) engineering-driven, heavily relying on conventional image-based\ncalibration pipelines, or (ii) inconvenient, requiring complex instrumentation.\nTo this end, we propose an accurate and convenient intrinsic calibration method\nfor event cameras, named eKalibr, which builds upon a carefully designed\nevent-based circle grid pattern recognition algorithm. 
To extract target\npatterns from events, we perform event-based normal flow estimation to identify\npotential events generated by circle edges, and cluster them spatially.\nSubsequently, event clusters associated with the same grid circles are matched\nand grouped using normal flows, for subsequent time-varying ellipse estimation.\nFitted ellipse centers are time-synchronized, for final grid pattern\nrecognition. We conducted extensive experiments to evaluate the performance of\neKalibr in terms of pattern extraction and intrinsic calibration. The\nimplementation of eKalibr is open-sourced at\n(https://github.com/Unsigned-Long/eKalibr) to benefit the research community.\n","authors":["Shuolong Chen","Xingxing Li","Liu Yuan","Ziao Liu"],"pdf_url":"https://arxiv.org/pdf/2501.05688v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05687v1","updated":"2025-01-10T03:38:16Z","published":"2025-01-10T03:38:16Z","title":"UniQ: Unified Decoder with Task-specific Queries for Efficient Scene\n Graph Generation","summary":" Scene Graph Generation (SGG) is a scene understanding task that aims at\nidentifying object entities and reasoning their relationships within a given\nimage. In contrast to prevailing two-stage methods based on a large object\ndetector (e.g., Faster R-CNN), one-stage methods integrate a fixed-size set of\nlearnable queries to jointly reason relational triplets. This paradigm demonstrates robust performance with significantly\nreduced parameters and computational overhead. However, the challenge in\none-stage methods stems from the issue of weak entanglement, wherein entities\ninvolved in relationships require both coupled features shared within triplets\nand decoupled visual features. Previous methods either adopt a single decoder\nfor coupled triplet feature modeling or multiple decoders for separate visual\nfeature extraction but fail to consider both. 
In this paper, we introduce UniQ,\na Unified decoder with task-specific Queries architecture, where task-specific\nqueries generate decoupled visual features for subjects, objects, and\npredicates respectively, and a unified decoder enables coupled feature modeling\nwithin relational triplets. Experimental results on the Visual Genome dataset\ndemonstrate that UniQ has superior performance to both one-stage and two-stage\nmethods.\n","authors":["Xinyao Liao","Wei Wei","Dangyang Chen","Yuanyuan Fu"],"pdf_url":"https://arxiv.org/pdf/2501.05687v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2501.05686v1","updated":"2025-01-10T03:35:22Z","published":"2025-01-10T03:35:22Z","title":"Deep Reversible Consistency Learning for Cross-modal Retrieval","summary":" Cross-modal retrieval (CMR) typically involves learning common\nrepresentations to directly measure similarities between multimodal samples.\nMost existing CMR methods commonly assume multimodal samples in pairs and\nemploy joint training to learn common representations, limiting the flexibility\nof CMR. Although some methods adopt independent training strategies for each\nmodality to improve flexibility in CMR, they utilize the randomly initialized\northogonal matrices to guide representation learning, which is suboptimal since\nthey assume inter-class samples are independent of each other, limiting the\npotential of semantic alignments between sample representations and\nground-truth labels. To address these issues, we propose a novel method termed\nDeep Reversible Consistency Learning (DRCL) for cross-modal retrieval. DRCL\nincludes two core modules, \\textit{i.e.,} Selective Prior Learning (SPL) and Reversible\nSemantic Consistency learning (RSC). More specifically, SPL first learns a\ntransformation weight matrix on each modality and selects the best one based on\nthe quality score as the Prior, which greatly avoids blind selection of priors\nlearned from low-quality modalities. 
Then, RSC employs a Modality-invariant\nRepresentation Recasting mechanism (MRR) to recast the potential\nmodality-invariant representations from sample semantic labels by the\ngeneralized inverse matrix of the prior. Since labels are devoid of\nmodal-specific information, we utilize the recast features to guide the\nrepresentation learning, thus maintaining semantic consistency to the fullest\nextent possible. In addition, a feature augmentation mechanism (FA) is\nintroduced in RSC to encourage the model to learn over a wider data\ndistribution for diversity. Finally, extensive experiments conducted on five\nwidely used datasets and comparisons with 15 state-of-the-art baselines\ndemonstrate the effectiveness and superiority of our DRCL.\n","authors":["Ruitao Pu","Yang Qin","Dezhong Peng","Xiaomin Song","Huiming Zheng"],"pdf_url":"https://arxiv.org/pdf/2501.05686v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01148v3","updated":"2025-01-10T03:28:00Z","published":"2024-09-02T10:33:45Z","title":"FMRFT: Fusion Mamba and DETR for Query Time Sequence Intersection Fish\n Tracking","summary":" Early detection of abnormal fish behavior caused by disease or hunger can be\nachieved through fish tracking using deep learning techniques, which holds\nsignificant value for industrial aquaculture. However, underwater reflections\nand fish-specific factors, such as high visual similarity, rapid swimming caused\nby stimuli, and mutual occlusion, bring challenges to multi-target tracking of\nfish. To address these challenges, this paper establishes a complex\nmulti-scenario sturgeon tracking dataset and introduces the FMRFT model, a\nreal-time end-to-end fish tracking solution. The model incorporates the low\nvideo memory consumption Mamba In Mamba (MIM) architecture, which facilitates\nmulti-frame temporal memory and feature extraction, thereby addressing the\nchallenges of tracking multiple fish across frames. 
Additionally, the FMRFT model\nwith the Query Time Sequence Intersection (QTSI) module effectively manages\noccluded objects and reduces redundant tracking frames using the superior\nfeature interaction and prior frame processing capabilities of RT-DETR. This\ncombination significantly enhances the accuracy and stability of fish tracking.\nTrained and tested on the dataset, the model achieves an IDF1 score of 90.3%\nand a MOTA accuracy of 94.3%. Experimental results show that the proposed FMRFT\nmodel effectively addresses the challenges of high similarity and mutual\nocclusion in fish populations, enabling accurate tracking in factory farming\nenvironments.\n","authors":["Mingyuan Yao","Yukang Huo","Qingbin Tian","Jiayin Zhao","Xiao Liu","Ruifeng Wang","Lin Xue","Haihua Wang"],"pdf_url":"https://arxiv.org/pdf/2409.01148v3.pdf","comment":"14 pages,14 figures"},{"id":"http://arxiv.org/abs/2501.04608v2","updated":"2025-01-10T03:08:11Z","published":"2025-01-08T16:44:06Z","title":"Comprehensive Examination of Unrolled Networks for Solving Linear\n Inverse Problems","summary":" Unrolled networks have become prevalent in various computer vision and\nimaging tasks. Although they have demonstrated remarkable efficacy in solving\nspecific computer vision and computational imaging tasks, their adaptation to\nother applications presents considerable challenges. This is primarily due to\nthe multitude of design decisions that practitioners working on new\napplications must navigate, each potentially affecting the network's overall\nperformance. These decisions include selecting the optimization algorithm,\ndefining the loss function, and determining the number of convolutional layers,\namong others. Compounding the issue, evaluating each design choice requires\ntime-consuming simulations to train, fine-tune the neural network, and optimize\nfor its performance. 
As a result, the process of exploring multiple options and\nidentifying the optimal configuration becomes time-consuming and\ncomputationally demanding. The main objectives of this paper are (1) to unify\nsome ideas and methodologies used in unrolled networks to reduce the number of\ndesign choices a user has to make, and (2) to report a comprehensive ablation\nstudy to discuss the impact of each of the choices involved in designing\nunrolled networks and present practical recommendations based on our findings.\nWe anticipate that this study will help scientists and engineers design\nunrolled networks for their applications and diagnose problems within their\nnetworks efficiently.\n","authors":["Eric Chen","Xi Chen","Arian Maleki","Shirin Jalali"],"pdf_url":"https://arxiv.org/pdf/2501.04608v2.pdf","comment":"27 pages, 10 figures. Project Page:\n https://github.com/YuxiChen25/Memory-Net-Inverse"},{"id":"http://arxiv.org/abs/2501.05669v1","updated":"2025-01-10T02:36:37Z","published":"2025-01-10T02:36:37Z","title":"LPRnet: A self-supervised registration network for LiDAR and\n photogrammetric point clouds","summary":" LiDAR and photogrammetry are active and passive remote sensing techniques for\npoint cloud acquisition, respectively, offering complementary advantages but\nheterogeneous characteristics. Due to the fundamental differences in sensing\nmechanisms, spatial distributions and coordinate systems, their point clouds\nexhibit significant discrepancies in density, precision, noise, and overlap.\nCoupled with the lack of ground truth for large-scale scenes, integrating the\nheterogeneous point clouds is a highly challenging task. This paper proposes a\nself-supervised registration network based on a masked autoencoder, focusing on\nheterogeneous LiDAR and photogrammetric point clouds. At its core, the method\nintroduces a multi-scale masked training strategy to extract robust features\nfrom heterogeneous point clouds under self-supervision. 
To further enhance\nregistration performance, a rotation-translation embedding module is designed\nto effectively capture the key features essential for accurate rigid\ntransformations. Building upon the robust representations, a transformer-based\narchitecture seamlessly integrates local and global features, fostering precise\nalignment across diverse point cloud datasets. The proposed method demonstrates\nstrong feature extraction capabilities for both LiDAR and photogrammetric point\nclouds, addressing the challenges of acquiring ground truth at the scene level.\nExperiments conducted on two real-world datasets validate the effectiveness of\nthe proposed method in solving heterogeneous point cloud registration problems.\n","authors":["Chen Wang","Yanfeng Gu","Xian Li"],"pdf_url":"https://arxiv.org/pdf/2501.05669v1.pdf","comment":"12 pages, 9 figures, 5 tables"},{"id":"http://arxiv.org/abs/2409.12953v4","updated":"2025-01-10T02:31:03Z","published":"2024-09-19T17:58:16Z","title":"JourneyBench: A Challenging One-Stop Vision-Language Understanding\n Benchmark of Generated Images","summary":" Existing vision-language understanding benchmarks largely consist of images\nof objects in their usual contexts. As a consequence, recent multimodal large\nlanguage models can perform well with only a shallow visual understanding by\nrelying on background language biases. Thus, strong performance on these\nbenchmarks does not necessarily correlate with strong visual understanding. In\nthis paper, we release JourneyBench, a comprehensive human-annotated benchmark\nof generated images designed to assess the model's fine-grained multimodal\nreasoning abilities across five tasks: complementary multimodal chain of\nthought, multi-image VQA, imaginary image captioning, VQA with hallucination\ntriggers, and fine-grained retrieval with sample-specific distractors. 
Unlike\nexisting benchmarks, JourneyBench explicitly requires fine-grained multimodal\nreasoning in unusual imaginary scenarios where language bias and holistic image\ngist are insufficient. We benchmark state-of-the-art models on JourneyBench and\nanalyze performance along a number of fine-grained dimensions. Results across\nall five tasks show that JourneyBench is exceptionally challenging for even the\nbest models, indicating that models' visual reasoning abilities are not as\nstrong as they first appear. We discuss the implications of our findings and\npropose avenues for further research.\n","authors":["Zhecan Wang","Junzhang Liu","Chia-Wei Tang","Hani Alomari","Anushka Sivakumar","Rui Sun","Wenhao Li","Md. Atabuzzaman","Hammad Ayyubi","Haoxuan You","Alvi Ishmam","Kai-Wei Chang","Shih-Fu Chang","Chris Thomas"],"pdf_url":"https://arxiv.org/pdf/2409.12953v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14080v4","updated":"2025-01-10T02:12:05Z","published":"2024-06-20T07:56:51Z","title":"CMTNet: Convolutional Meets Transformer Network for Hyperspectral Images\n Classification","summary":" Hyperspectral remote sensing (HSI) enables the detailed capture of spectral\ninformation from the Earth's surface, facilitating precise classification and\nidentification of surface crops due to its superior spectral diagnostic\ncapabilities. However, current convolutional neural networks (CNNs) focus on\nlocal features in hyperspectral data, leading to suboptimal performance when\nclassifying intricate crop types and addressing imbalanced sample\ndistributions. In contrast, the Transformer framework excels at extracting\nglobal features from hyperspectral imagery. To leverage the strengths of both\napproaches, this research introduces the Convolutional Meets Transformer\nNetwork (CMTNet). 
This innovative model includes a spectral-spatial feature extraction\nmodule for shallow feature capture, a dual-branch structure combining CNN and\nTransformer branches for local and global feature extraction, and a\nmulti-output constraint module that enhances classification accuracy through\nmulti-output loss calculations and cross constraints across local, global,\nand joint features. Extensive experiments conducted on three\ndatasets (WHU-Hi-LongKou, WHU-Hi-HanChuan, and WHU-Hi-HongHu) demonstrate that\nCMTNet significantly outperforms other state-of-the-art networks in\nclassification performance, validating its effectiveness in hyperspectral crop\nclassification.\n","authors":["Faxu Guo","Quan Feng","Sen Yang","Wanxia Yang"],"pdf_url":"https://arxiv.org/pdf/2406.14080v4.pdf","comment":"We have decided to withdraw this article due to significant\n adjustments in the research direction. The current manuscript no longer\n reflects the final conclusions of our study. We plan to revise and resubmit\n the work in the future."},{"id":"http://arxiv.org/abs/2412.20006v2","updated":"2025-01-10T01:09:37Z","published":"2024-12-28T04:06:29Z","title":"Adversarial Robustness for Deep Learning-based Wildfire Prediction\n Models","summary":" Smoke detection using Deep Neural Networks (DNNs) is an effective approach\nfor early wildfire detection. However, because smoke is temporally and\nspatially anomalous, there are limitations in collecting sufficient training\ndata. This raises overfitting and bias concerns in existing DNN-based wildfire\ndetection models. Thus, we introduce WARP (Wildfire Adversarial Robustness\nProcedure), the first model-agnostic framework for evaluating the adversarial\nrobustness of DNN-based wildfire detection models. WARP addresses limitations\nin smoke image diversity using global and local adversarial attack methods. 
The\nglobal attack method uses image-contextualized Gaussian noise, while the local\nattack method uses patch noise injection, tailored to address critical aspects\nof wildfire detection. Leveraging WARP's model-agnostic capabilities, we assess\nthe adversarial robustness of real-time Convolutional Neural Networks (CNNs)\nand Transformers. The analysis revealed valuable insights into the models'\nlimitations. Specifically, the global attack method demonstrates that the\nTransformer model suffers over 70% more precision degradation than the CNN\nagainst global noise. In contrast, the local attack method shows that both models are\nsusceptible to cloud image injections when detecting smoke-positive instances,\nsuggesting a need for model improvements through data augmentation. WARP's\ncomprehensive robustness analysis contributed to the development of\nwildfire-specific data augmentation strategies, marking a step toward\npracticality.\n","authors":["Ryo Ide","Lei Yang"],"pdf_url":"https://arxiv.org/pdf/2412.20006v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.05488v2","updated":"2025-01-10T00:58:28Z","published":"2024-12-07T01:19:14Z","title":"Enhancing Sample Generation of Diffusion Models using Noise Level\n Correction","summary":" The denoising process of diffusion models can be interpreted as an\napproximate projection of noisy samples onto the data manifold. Moreover, the\nnoise level in these samples approximates their distance to the underlying\nmanifold. Building on this insight, we propose a novel method to enhance sample\ngeneration by aligning the estimated noise level with the true distance of\nnoisy samples to the manifold. Specifically, we introduce a noise level\ncorrection network, leveraging a pre-trained denoising network, to refine noise\nlevel estimates during the denoising process. 
Additionally, we extend this\napproach to various image restoration tasks by integrating task-specific\nconstraints, including inpainting, deblurring, super-resolution, colorization,\nand compressed sensing. Experimental results demonstrate that our method\nsignificantly improves sample quality in both unconstrained and constrained\ngeneration scenarios. Notably, the proposed noise level correction framework is\ncompatible with existing denoising schedulers (e.g., DDIM), offering additional\nperformance improvements.\n","authors":["Abulikemu Abuduweili","Chenyang Yuan","Changliu Liu","Frank Permenter"],"pdf_url":"https://arxiv.org/pdf/2412.05488v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05631v1","updated":"2025-01-10T00:20:29Z","published":"2025-01-10T00:20:29Z","title":"HFMF: Hierarchical Fusion Meets Multi-Stream Models for Deepfake\n Detection","summary":" The rapid progress in deep generative models has led to the creation of\nincredibly realistic synthetic images that are becoming increasingly difficult\nto distinguish from real-world data. The widespread use of Variational Models,\nDiffusion Models, and Generative Adversarial Networks has made it easier to\ngenerate convincing fake images and videos, which poses significant challenges\nfor detecting and mitigating the spread of misinformation. As a result,\ndeveloping effective methods for detecting AI-generated fakes has become a\npressing concern. In our research, we propose HFMF, a comprehensive two-stage\ndeepfake detection framework that leverages both hierarchical cross-modal\nfeature fusion and multi-stream feature extraction to enhance detection\nperformance against imagery produced by state-of-the-art generative AI models.\nThe first component of our approach integrates vision Transformers and\nconvolutional nets through a hierarchical feature fusion mechanism. The second\ncomponent of our framework combines object-level information and a fine-tuned\nconvolutional net model. 
We then fuse the outputs from both components via an\nensemble deep neural net, enabling robust classification performance. We\ndemonstrate that our architecture achieves superior performance across diverse\ndataset benchmarks while maintaining calibration and interpretability.\n","authors":["Anant Mehta","Bryant McArthur","Nagarjuna Kolloju","Zhengzhong Tu"],"pdf_url":"https://arxiv.org/pdf/2501.05631v1.pdf","comment":"This work is accepted to WACV 2025 Workshop on AI for Multimedia\n Forensics & Disinformation Detection. Code is available at:\n https://github.com/taco-group/HFMF"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2405.12327v3","updated":"2025-01-10T18:30:45Z","published":"2024-05-20T18:52:33Z","title":"Beyond Item Dissimilarities: Diversifying by Intent in Recommender\n Systems","summary":" It has become increasingly clear that recommender systems that overly focus\non short-term engagement prevent users from exploring diverse interests,\nultimately hurting long-term user experience. To tackle this challenge,\nnumerous diversification algorithms have been proposed. These algorithms\ntypically rely on measures of item similarity, aiming to maximize the\ndissimilarity across items in the final set of recommendations. However, in\nthis work, we demonstrate the benefits of going beyond item-level similarities\nby utilizing higher-level user understanding--specifically, user intents that\npersist across multiple interactions--in diversification. Our approach is\nmotivated by the observation that user behaviors on online platforms are\nlargely driven by their underlying intents. Therefore, recommendations should\nensure that diverse user intents are accurately represented. While intent has\nprimarily been studied in the context of search, it is less clear how to\nincorporate real-time dynamic intent predictions into recommender systems. 
To\naddress this gap, we develop a probabilistic intent-based whole-page\ndiversification framework for the final stage of a recommender system. Starting\nwith a prior belief of user intents, the proposed framework sequentially\nselects items for each position based on these beliefs and subsequently updates\nposterior beliefs about the intents. This approach ensures that different user\nintents are represented on a page, towards optimizing long-term user\nexperience. We experiment with the intent diversification framework on YouTube,\nthe world's largest video recommendation platform, serving billions of users\ndaily. Live experiments on a diverse set of intents show that the proposed\nframework increases Daily Active Users (DAU) and overall user enjoyment,\nvalidating its effectiveness in facilitating long-term planning.\n","authors":["Yuyan Wang","Cheenar Banerjee","Samer Chucri","Fabio Soldo","Sriraj Badam","Ed H. Chi","Minmin Chen"],"pdf_url":"https://arxiv.org/pdf/2405.12327v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06121v1","updated":"2025-01-10T17:19:59Z","published":"2025-01-10T17:19:59Z","title":"kANNolo: Sweet and Smooth Approximate k-Nearest Neighbors Search","summary":" Approximate Nearest Neighbors (ANN) search is a crucial task in several\napplications like recommender systems and information retrieval. Current\nstate-of-the-art ANN libraries, although performance-oriented, often lack\nmodularity and ease of use. This makes them not fully suitable for easy\nprototyping and testing of research ideas, an important feature to enable. We\naddress these limitations by introducing kANNolo, a novel research-oriented ANN\nlibrary written in Rust and explicitly designed to combine usability with\nperformance effectively. kANNolo is the first ANN library that supports dense\nand sparse vector representations made available on top of different similarity\nmeasures, e.g., Euclidean distance and inner product. 
Moreover, it also supports vector quantization techniques, e.g.,\nProduct Quantization, on top of the indexing strategies implemented. These\nfunctionalities are managed through Rust traits, allowing shared behaviors to\nbe handled abstractly. This abstraction ensures flexibility and facilitates an\neasy integration of new components. In this work, we detail the architecture of\nkANNolo and demonstrate that its flexibility does not compromise performance.\nThe experimental analysis shows that kANNolo achieves state-of-the-art\nperformance in terms of speed-accuracy trade-off while allowing fast and easy\nprototyping, thus making kANNolo a valuable tool for advancing ANN research.\nSource code available on GitHub: https://github.com/TusKANNy/kannolo.\n","authors":["Leonardo Delfino","Domenico Erriquez","Silvio Martinico","Franco Maria Nardini","Cosimo Rulli","Rossano Venturini"],"pdf_url":"https://arxiv.org/pdf/2501.06121v1.pdf","comment":"7 pages, 3 figures"},{"id":"http://arxiv.org/abs/2501.05964v1","updated":"2025-01-10T13:46:23Z","published":"2025-01-10T13:46:23Z","title":"Recommender Systems for Social Good: The Role of Accountability and\n Sustainability","summary":" This work examines the role of recommender systems in promoting\nsustainability, social responsibility, and accountability, with a focus on\nalignment with the United Nations Sustainable Development Goals (SDGs). As\nrecommender systems become increasingly integrated into daily interactions,\nthey must go beyond personalization to support responsible consumption, reduce\nenvironmental impact, and foster social good. We explore strategies to mitigate\nthe carbon footprint of recommendation models, ensure fairness, and implement\naccountability mechanisms. 
By adopting these approaches, recommender systems\ncan contribute to sustainable and socially beneficial outcomes, aligning\ntechnological advancements with the SDGs focused on environmental\nsustainability and social well-being.\n","authors":["Alan Said"],"pdf_url":"https://arxiv.org/pdf/2501.05964v1.pdf","comment":"First International Workshop on Recommender Systems for\n Sustainability and Social Good (RecSoGood'24)"},{"id":"http://arxiv.org/abs/2501.05925v1","updated":"2025-01-10T12:44:46Z","published":"2025-01-10T12:44:46Z","title":"Navigating Tomorrow: Reliably Assessing Large Language Models\n Performance on Future Event Prediction","summary":" Predicting future events is an important activity with applications across\nmultiple fields and domains. For example, the capacity to foresee stock market\ntrends, natural disasters, business developments, or political events can\nfacilitate early preventive measures and uncover new opportunities. Multiple\ndiverse computational methods for attempting future predictions, including\npredictive analysis, time series forecasting, and simulations have been\nproposed. This study evaluates the performance of several large language models\n(LLMs) in supporting future prediction tasks, an under-explored domain. We\nassess the models across three scenarios: Affirmative vs. Likelihood\nquestioning, Reasoning, and Counterfactual analysis. For this, we create a\ndataset by finding and categorizing news articles based on entity type and its\npopularity. We gather news articles before and after the LLMs' training cutoff\ndate in order to thoroughly test and compare model performance. 
Our research\nhighlights LLMs' potential and limitations in predictive modeling, providing a\nfoundation for future improvements.\n","authors":["Petraq Nako","Adam Jatowt"],"pdf_url":"https://arxiv.org/pdf/2501.05925v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05894v1","updated":"2025-01-10T11:46:51Z","published":"2025-01-10T11:46:51Z","title":"Text2Playlist: Generating Personalized Playlists from Text on Deezer","summary":" The streaming service Deezer heavily relies on search to help users\nnavigate through its extensive music catalog. Nonetheless, it is primarily\ndesigned to find specific items and does not lead directly to a smooth\nlistening experience. We present Text2Playlist, a stand-alone tool that\naddresses these limitations. Text2Playlist leverages generative AI, music\ninformation retrieval and recommendation systems to generate query-specific and\npersonalized playlists, successfully deployed at scale.\n","authors":["Mathieu Delcluze","Antoine Khoury","Clémence Vast","Valerio Arnaudo","Léa Briand","Walid Bendada","Thomas Bouabça"],"pdf_url":"https://arxiv.org/pdf/2501.05894v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05874v1","updated":"2025-01-10T11:17:15Z","published":"2025-01-10T11:17:15Z","title":"VideoRAG: Retrieval-Augmented Generation over Video Corpus","summary":" Retrieval-Augmented Generation (RAG) is a powerful strategy to address the\nissue of generating factually incorrect outputs in foundation models by\nretrieving external knowledge relevant to queries and incorporating it into\ntheir generation process. However, existing RAG approaches have primarily\nfocused on textual information, with some recent advancements beginning to\nconsider images, and they largely overlook videos, a rich source of multimodal\nknowledge capable of representing events, processes, and contextual details\nmore effectively than any other modality. 
While a few recent studies explore\nthe integration of videos in the response generation process, they either\npredefine query-associated videos without retrieving them according to queries,\nor convert videos into textual descriptions without harnessing their\nmultimodal richness. To tackle these limitations, we introduce VideoRAG, a\nnovel framework that not only dynamically retrieves relevant videos based on\ntheir relevance to queries but also utilizes both visual and textual\ninformation of videos in the output generation. Further, to operationalize\nthis, our method revolves around the recent advance of Large Video Language\nModels (LVLMs), which enable the direct processing of video content to\nrepresent it for retrieval and seamless integration of the retrieved videos\njointly with queries. We experimentally validate the effectiveness of VideoRAG,\nshowcasing that it is superior to relevant baselines.\n","authors":["Soyeong Jeong","Kangsan Kim","Jinheon Baek","Sung Ju Hwang"],"pdf_url":"https://arxiv.org/pdf/2501.05874v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05813v1","updated":"2025-01-10T09:32:28Z","published":"2025-01-10T09:32:28Z","title":"Social web and Wikipedia: an opportunity to rethink the links between\n sources' credibility, trust and authority","summary":" The Web and its main tools (Google, Wikipedia, Facebook, Twitter) deeply\nraise and renew fundamental questions, that everyone asks almost every day: Is\nthis information or content true? Can I trust this author or source? These\nquestions are not new; they have been the same with books, newspapers,\nbroadcasting and television, and, more fundamentally, in every human\ninterpersonal communication. This paper is focused on two scientific problems\non this issue. The first one is theoretical: to address this issue, many\nconcepts have been used in library and information sciences, communication and\npsychology. 
The links between these concepts are not clear: sometimes two\nconcepts are considered synonymous, sometimes as very different. The second\none is historical: sources like Wikipedia deeply challenge the epistemic\nevaluation of information sources, compared to previous modes of information\nproduction. This paper proposes an integrated and simple model considering the\nrelation between a user, a document and an author as human communication. It\nreduces the problem to three concepts: credibility as a characteristic granted\nto information depending on its truth-value; trust as the ability to produce\ncredible information; authority when an author's power to influence is\naccepted, i.e., when readers accept that the source can modify their opinion,\nknowledge and decisions. The model also describes two kinds of relationships\nbetween the three concepts: an upward link and a downward link. The model is\nconfronted with findings of empirical research on Wikipedia in particular.\n","authors":["Gilles Sahut","André Tricot"],"pdf_url":"https://arxiv.org/pdf/2501.05813v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.15005v3","updated":"2025-01-10T01:52:48Z","published":"2024-11-22T15:29:05Z","title":"Multi-granularity Interest Retrieval and Refinement Network for\n Long-Term User Behavior Modeling in CTR Prediction","summary":" Click-through Rate (CTR) prediction is crucial for online personalization\nplatforms. Recent advancements have shown that modeling rich user behaviors can\nsignificantly improve the performance of CTR prediction. Current long-term user\nbehavior modeling algorithms predominantly follow two cascading stages. The\nfirst stage retrieves a subsequence related to the target item from the\nlong-term behavior sequence, while the second stage models the relationship\nbetween the subsequence and the target item. Despite significant progress,\nthese methods have two critical flaws. 
First, the retrieval query typically includes only\ntarget item information, limiting the ability to capture the user's diverse\ninterests. Second, relational information, such as sequential and interactive\ninformation within the subsequence, is frequently overlooked. Therefore, it\nneeds to be further mined to model user interests more accurately.\n To this end, we propose Multi-granularity Interest Retrieval and Refinement\nNetwork (MIRRN). Specifically, we first construct queries based on behaviors\nobserved at different time scales to obtain subsequences, each capturing users'\ninterest at various granularities. We then introduce a novel multi-head\nFourier transformer to efficiently learn sequential and interactive information\nwithin the subsequences, leading to more accurate modeling of user interests.\nFinally, we employ multi-head target attention to adaptively assess the impact\nof these multi-granularity interests on the target item. Extensive experiments\nhave demonstrated that MIRRN significantly outperforms state-of-the-art\nbaselines. Furthermore, an A/B test shows that MIRRN increases the average\nnumber of songs listened to by 1.32% and the average listening time by\n0.55% on the Huawei Music App. 
The implementation code is publicly available at\nhttps://github.com/USTC-StarTeam/MIRRN.\n","authors":["Xiang Xu","Hao Wang","Wei Guo","Luankang Zhang","Wanshan Yang","Runlong Yu","Yong Liu","Defu Lian","Enhong Chen"],"pdf_url":"https://arxiv.org/pdf/2411.15005v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05647v1","updated":"2025-01-10T01:27:12Z","published":"2025-01-10T01:27:12Z","title":"Collaboration of Large Language Models and Small Recommendation Models\n for Device-Cloud Recommendation","summary":" Large Language Models (LLMs) for Recommendation (LLM4Rec) is a promising\nresearch direction that has demonstrated exceptional performance in this field.\nHowever, its inability to capture real-time user preferences greatly limits the\npractical application of LLM4Rec because (i) LLMs are costly to train and infer\nfrequently, and (ii) LLMs struggle to access real-time data (their large number\nof parameters poses an obstacle to deployment on devices). Fortunately, small\nrecommendation models (SRMs) can effectively supplement these shortcomings of\nthe LLM4Rec paradigm by consuming minimal resources for frequent training and\ninference, and by conveniently accessing real-time data on devices.\n In light of this, we designed the Device-Cloud LLM-SRM Collaborative\nRecommendation Framework (LSC4Rec) under a device-cloud collaboration setting.\nLSC4Rec aims to integrate the advantages of both LLMs and SRMs, as well as the\nbenefits of cloud and edge computing, achieving a complementary synergy. We\nenhance the practicability of LSC4Rec by designing three strategies:\ncollaborative training, collaborative inference, and intelligent request.\nDuring training, LLM generates candidate lists to enhance the ranking ability\nof SRM in collaborative scenarios and enables SRM to update adaptively to\ncapture real-time user interests. During inference, LLM and SRM are deployed on\nthe cloud and on the device, respectively. 
LLM generates candidate lists and\ninitial ranking results based on user behavior, and SRM produces reranking\nresults based on the candidate list, with final results integrating both LLM's\nand SRM's scores. The device determines whether a new candidate list is needed by\ncomparing the consistency of the LLM's and SRM's sorted lists. Our\ncomprehensive and extensive experimental analysis validates the effectiveness\nof each strategy in LSC4Rec.\n","authors":["Zheqi Lv","Tianyu Zhan","Wenjie Wang","Xinyu Lin","Shengyu Zhang","Wenqiao Zhang","Jiwei Li","Kun Kuang","Fei Wu"],"pdf_url":"https://arxiv.org/pdf/2501.05647v1.pdf","comment":"Published on KDD'25: Proceedings of the ACM SIGKDD Conference on\n Knowledge Discovery and Data Mining 2025"},{"id":"http://arxiv.org/abs/2501.06365v1","updated":"2025-01-10T22:07:56Z","published":"2025-01-10T22:07:56Z","title":"Gender-Neutral Large Language Models for Medical Applications: Reducing\n Bias in PubMed Abstracts","summary":" This paper presents a pipeline for mitigating gender bias in large language\nmodels (LLMs) used in medical literature by neutralizing gendered occupational\npronouns. A dataset of 379,000 PubMed abstracts from 1965-1980 was processed to\nidentify and modify pronouns tied to professions. We developed a BERT-based\nmodel, ``Modern Occupational Bias Elimination with Refined Training,'' or\n``MOBERT,'' trained on these neutralized abstracts, and compared its\nperformance with ``1965Bert,'' trained on the original dataset. MOBERT achieved\na 70\\% inclusive replacement rate, while 1965Bert reached only 4\\%. A further\nanalysis of MOBERT revealed that pronoun replacement accuracy correlated with\nthe frequency of occupational terms in the training data. 
We propose expanding\nthe dataset and refining the pipeline to improve performance and ensure more\nequitable language modeling in medical applications.\n","authors":["Elizabeth Schaefer","Kirk Roberts"],"pdf_url":"https://arxiv.org/pdf/2501.06365v1.pdf","comment":"9 pages, 4 figures"},{"id":"http://arxiv.org/abs/2501.06362v1","updated":"2025-01-10T21:58:34Z","published":"2025-01-10T21:58:34Z","title":"Repeat-bias-aware Optimization of Beyond-accuracy Metrics for Next\n Basket Recommendation","summary":" In next basket recommendation (NBR) a set of items is recommended to users\nbased on their historical basket sequences. In many domains, the recommended\nbaskets consist of both repeat items and explore items. Some state-of-the-art\nNBR methods are heavily biased to recommend repeat items so as to maximize\nutility. The evaluation and optimization of beyond-accuracy objectives for NBR,\nsuch as item fairness and diversity, has attracted increasing attention. How\ncan such beyond-accuracy objectives be pursued in the presence of heavy repeat\nbias? We find that only optimizing diversity or item fairness without\nconsidering repeat bias may cause NBR algorithms to recommend more repeat\nitems. To solve this problem, we propose a model-agnostic repeat-bias-aware\noptimization algorithm to post-process the recommended results obtained from\nNBR methods with the objective of mitigating repeat bias when optimizing\ndiversity or item fairness. We consider multiple variations of our optimization\nalgorithm to cater to multiple NBR methods. 
Experiments on three real-world\ngrocery shopping datasets show that the proposed algorithms can effectively\nimprove diversity and item fairness, and mitigate repeat bias at an acceptable\nRecall loss.\n","authors":["Yuanna Liu","Ming Li","Mohammad Aliannejadi","Maarten de Rijke"],"pdf_url":"https://arxiv.org/pdf/2501.06362v1.pdf","comment":"This paper has been accepted as a full paper at the 47th European\n Conference on Information Retrieval (ECIR2025)"},{"id":"http://arxiv.org/abs/2412.16435v2","updated":"2025-01-10T19:49:12Z","published":"2024-12-21T01:52:03Z","title":"THeGCN: Temporal Heterophilic Graph Convolutional Network","summary":" Graph Neural Networks (GNNs) have exhibited remarkable efficacy in diverse\ngraph learning tasks, particularly on static homophilic graphs. Recent\nattention has pivoted towards more intricate structures, encompassing (1)\nstatic heterophilic graphs encountering the edge heterophily issue in the\nspatial domain and (2) event-based continuous graphs in the temporal domain.\nState-of-the-art (SOTA) methods have been concurrently addressing these two\nlines of work but tend to overlook the presence of heterophily in the temporal\ndomain, constituting the temporal heterophily issue. Furthermore, we highlight\nthat the edge heterophily issue and the temporal heterophily issue often\nco-exist in event-based continuous graphs, giving rise to the temporal edge\nheterophily challenge. To tackle this challenge, this paper first introduces\nthe temporal edge heterophily measurement. Subsequently, we propose the\nTemporal Heterophilic Graph Convolutional Network (THeGCN), an innovative model\nthat incorporates the low/high-pass graph signal filtering technique to\naccurately capture both edge (spatial) heterophily and temporal heterophily.\nSpecifically, the THeGCN model consists of two key components: a sampler and an\naggregator. The sampler selects events relevant to a node at a given moment. 
Then, the\naggregator executes message-passing, encoding temporal information, node\nattributes, and edge attributes into node embeddings. Extensive experiments\nconducted on 5 real-world datasets validate the efficacy of THeGCN.\n","authors":["Yuchen Yan","Yuzhong Chen","Huiyuan Chen","Xiaoting Li","Zhe Xu","Zhichen Zeng","Lihui Liu","Zhining Liu","Hanghang Tong"],"pdf_url":"https://arxiv.org/pdf/2412.16435v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06277v1","updated":"2025-01-10T12:48:29Z","published":"2025-01-10T12:48:29Z","title":"Environmental large language model Evaluation (ELLE) dataset: A\n Benchmark for Evaluating Generative AI applications in Eco-environment Domain","summary":" Generative AI holds significant potential for ecological and environmental\napplications such as monitoring, data analysis, education, and policy support.\nHowever, its effectiveness is limited by the lack of a unified evaluation\nframework. To address this, we present the Environmental Large Language model\nEvaluation (ELLE) question answer (QA) dataset, the first benchmark designed to\nassess large language models and their applications in ecological and\nenvironmental sciences. The ELLE dataset includes 1,130 question answer pairs\nacross 16 environmental topics, categorized by domain, difficulty, and type.\nThis comprehensive dataset standardizes performance assessments in these\nfields, enabling consistent and objective comparisons of generative AI\nperformance. By providing a dedicated evaluation tool, the ELLE dataset promotes\nthe development and application of generative AI technologies for sustainable\nenvironmental outcomes. 
The dataset and code are available at\nhttps://elle.ceeai.net/ and https://github.com/CEEAI/elle.\n","authors":["Jing Guo","Nan Li","Ming Xu"],"pdf_url":"https://arxiv.org/pdf/2501.06277v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07596v1","updated":"2025-01-10T01:42:43Z","published":"2025-01-10T01:42:43Z","title":"Optimize Incompatible Parameters through Compatibility-aware Knowledge\n Integration","summary":" Deep neural networks have become foundational to advancements in multiple\ndomains, including recommendation systems, natural language processing, and so\non. Despite their successes, these models often contain incompatible parameters\nthat can be underutilized or detrimental to model performance, particularly\nwhen faced with specific, varying data distributions. Existing research excels\nin removing such parameters or merging the outputs of multiple different\npretrained models. However, the former focuses on efficiency rather than\nperformance, while the latter requires several times more computing and storage\nresources to support inference. In this paper, we set the goal to explicitly\nimprove these incompatible parameters by leveraging the complementary strengths\nof different models, thereby directly enhancing the models without any\nadditional parameters. Specifically, we propose Compatibility-aware Knowledge\nIntegration (CKI), which consists of Parameter Compatibility Assessment and\nParameter Splicing, which are used to evaluate the knowledge content of\nmultiple models and integrate the knowledge into one model, respectively. 
The\nintegrated model can be used directly for inference or for further fine-tuning.\nWe conduct extensive experiments on various datasets for recommendation and\nlanguage tasks, and the results show that Compatibility-aware Knowledge\nIntegration can effectively optimize incompatible parameters under multiple\ntasks and settings to break through the training limit of the original model\nwithout increasing the inference cost.\n","authors":["Zheqi Lv","Keming Ye","Zishu Wei","Qi Tian","Shengyu Zhang","Wenqiao Zhang","Wenjie Wang","Kun Kuang","Tat-Seng Chua","Fei Wu"],"pdf_url":"https://arxiv.org/pdf/2501.07596v1.pdf","comment":"Published on AAAI'25: The Annual AAAI Conference on Artificial\n Intelligence"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2501.05450v2","updated":"2025-01-10T18:58:11Z","published":"2025-01-09T18:59:56Z","title":"Decentralized Diffusion Models","summary":" Large-scale AI model training divides work across thousands of GPUs, then\nsynchronizes gradients across them at each step. This incurs a significant\nnetwork burden that only centralized, monolithic clusters can support, driving\nup infrastructure costs and straining power systems. We propose Decentralized\nDiffusion Models, a scalable framework for distributing diffusion model\ntraining across independent clusters or datacenters by eliminating the\ndependence on a centralized, high-bandwidth networking fabric. Our method\ntrains a set of expert diffusion models over partitions of the dataset, each in\nfull isolation from one another. At inference time, the experts ensemble\nthrough a lightweight router. We show that the ensemble collectively optimizes\nthe same objective as a single model trained over the whole dataset. 
This means\nwe can divide the training burden among a number of \"compute islands,\" lowering\ninfrastructure costs and improving resilience to localized GPU failures.\nDecentralized diffusion models empower researchers to take advantage of\nsmaller, more cost-effective and more readily available compute like on-demand\nGPU nodes rather than central integrated systems. We conduct extensive\nexperiments on ImageNet and LAION Aesthetics, showing that decentralized\ndiffusion models FLOP-for-FLOP outperform standard diffusion models. We finally\nscale our approach to 24 billion parameters, demonstrating that high-quality\ndiffusion models can now be trained with just eight individual GPU nodes in\nless than a week.\n","authors":["David McAllister","Matthew Tancik","Jiaming Song","Angjoo Kanazawa"],"pdf_url":"https://arxiv.org/pdf/2501.05450v2.pdf","comment":"Project webpage: https://decentralizeddiffusion.github.io/"},{"id":"http://arxiv.org/abs/2501.06171v1","updated":"2025-01-10T18:50:45Z","published":"2025-01-10T18:50:45Z","title":"Machine Learning Force-Field Approach for Itinerant Electron Magnets","summary":" We review the recent development of machine-learning (ML) force-field\nframeworks for Landau-Lifshitz-Gilbert (LLG) dynamics simulations of itinerant\nelectron magnets, focusing on the general theory and implementations of\nsymmetry-invariant representations of spin configurations. The crucial\nproperties that such magnetic descriptors must satisfy are differentiability\nwith respect to spin rotations and invariance to both lattice point-group\nsymmetry and internal spin rotation symmetry. We propose an efficient\nimplementation based on the concept of reference irreducible representations,\nmodified from the group-theoretical power-spectrum and bispectrum methods. The\nML framework is demonstrated using the s-d models, which are widely applied in\nspintronics research. 
We show that LLG simulations based on local fields\npredicted by the trained ML models successfully reproduce representative\nnon-collinear spin structures, including 120$^\\circ$, tetrahedral, and skyrmion\ncrystal orders of the triangular-lattice s-d models. Large-scale thermal quench\nsimulations enabled by ML models further reveal intriguing freezing dynamics\nand glassy stripe states consisting of skyrmions and bi-merons. Our work\nhighlights the utility of the ML force-field approach to dynamical modeling of\ncomplex spin orders in itinerant electron magnets.\n","authors":["Sheng Zhang","Yunhao Fan","Kotaro Shimizu","Gia-Wei Chern"],"pdf_url":"https://arxiv.org/pdf/2501.06171v1.pdf","comment":"18 pages, 8 figures"},{"id":"http://arxiv.org/abs/2501.06167v1","updated":"2025-01-10T18:46:28Z","published":"2025-01-10T18:46:28Z","title":"Meta-Learning for Physically-Constrained Neural System Identification","summary":" We present a gradient-based meta-learning framework for rapid adaptation of\nneural state-space models (NSSMs) for black-box system identification. When\napplicable, we also incorporate domain-specific physical constraints to improve\nthe accuracy of the NSSM. The major benefit of our approach is that instead of\nrelying solely on data from a single target system, our framework utilizes data\nfrom a diverse set of source systems, enabling learning from limited target\ndata, as well as with few online training iterations. Through benchmark\nexamples, we demonstrate the potential of our approach, study the effect of\nfine-tuning subnetworks rather than full fine-tuning, and report real-world\ncase studies to illustrate the practical application and generalizability of\nthe approach to practical problems with physical constraints. 
Specifically, we\nshow that the meta-learned models result in improved downstream performance in\nmodel-based state estimation in indoor localization and energy systems.\n","authors":["Ankush Chakrabarty","Gordon Wichern","Vedang M. Deshpande","Abraham P. Vinod","Karl Berntorp","Christopher R. Laughman"],"pdf_url":"https://arxiv.org/pdf/2501.06167v1.pdf","comment":"30 pages"},{"id":"http://arxiv.org/abs/2501.06164v1","updated":"2025-01-10T18:39:29Z","published":"2025-01-10T18:39:29Z","title":"Model Alignment Search","summary":" When can we say that two neural systems are the same? The answer to this\nquestion is goal-dependent, and it is often addressed through correlative\nmethods such as Representational Similarity Analysis (RSA) and Centered Kernel\nAlignment (CKA). What do we miss when we forgo causal explorations, and how can\nwe target specific types of similarity? In this work, we introduce Model\nAlignment Search (MAS), a method for causally exploring distributed\nrepresentational similarity. The method learns invertible linear\ntransformations that align a subspace between two distributed networks'\nrepresentations where causal information can be freely interchanged. We first\nshow that the method can be used to transfer specific causal variables, such as\nthe number of items in a counting task, between networks with different\ntraining seeds. We then explore open questions in number cognition by comparing\ndifferent types of numeric representations in models trained on structurally\ndifferent numeric tasks. We then explore differences between MAS and preexisting\ncausal similarity methods, showing MAS to be more resistant to unwanted\nexchanges. 
Lastly, we introduce a counterfactual latent auxiliary loss function\nthat helps shape causally relevant alignments even in cases where we do not\nhave causal access to one of the two models for training.\n","authors":["Satchel Grant"],"pdf_url":"https://arxiv.org/pdf/2501.06164v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06159v1","updated":"2025-01-10T18:32:05Z","published":"2025-01-10T18:32:05Z","title":"Efficient Transition State Searches by Freezing String Method with Graph\n Neural Network Potentials","summary":" Transition states are a critical bottleneck in chemical transformations.\nSignificant efforts have been made to develop algorithms that efficiently\nlocate transition states on potential energy surfaces. However, the\ncomputational cost of ab-initio potential energy surface evaluation limits the\nsize of chemical systems that can be routinely studied. In this work, we develop\nand fine-tune a graph neural network potential energy function suitable for\ndescribing organic chemical reactions and use it to rapidly identify transition\nstate guess structures. We successfully refine guess structures and locate a\ntransition state in each test system considered and reduce the average number\nof ab-initio calculations by 47% through use of the graph neural network\npotential energy function. 
Our results show that modern machine learning models\nhave reached levels of reliability whereby they can be used to accelerate\nroutine computational chemistry tasks.\n","authors":["Jonah Marks","Joseph Gomes"],"pdf_url":"https://arxiv.org/pdf/2501.06159v1.pdf","comment":"9 pages, 4 figures, 3 tables"},{"id":"http://arxiv.org/abs/2405.12327v3","updated":"2025-01-10T18:30:45Z","published":"2024-05-20T18:52:33Z","title":"Beyond Item Dissimilarities: Diversifying by Intent in Recommender\n Systems","summary":" It has become increasingly clear that recommender systems that overly focus\non short-term engagement prevent users from exploring diverse interests,\nultimately hurting long-term user experience. To tackle this challenge,\nnumerous diversification algorithms have been proposed. These algorithms\ntypically rely on measures of item similarity, aiming to maximize the\ndissimilarity across items in the final set of recommendations. However, in\nthis work, we demonstrate the benefits of going beyond item-level similarities\nby utilizing higher-level user understanding--specifically, user intents that\npersist across multiple interactions--in diversification. Our approach is\nmotivated by the observation that user behaviors on online platforms are\nlargely driven by their underlying intents. Therefore, recommendations should\nensure that diverse user intents are accurately represented. While intent has\nprimarily been studied in the context of search, it is less clear how to\nincorporate real-time dynamic intent predictions into recommender systems. To\naddress this gap, we develop a probabilistic intent-based whole-page\ndiversification framework for the final stage of a recommender system. Starting\nwith a prior belief of user intents, the proposed framework sequentially\nselects items for each position based on these beliefs and subsequently updates\nposterior beliefs about the intents. 
This approach ensures that different user\nintents are represented on a page, towards optimizing long-term user\nexperience. We experiment with the intent diversification framework on YouTube,\nthe world's largest video recommendation platform, serving billions of users\ndaily. Live experiments on a diverse set of intents show that the proposed\nframework increases Daily Active Users (DAU) and overall user enjoyment,\nvalidating its effectiveness in facilitating long-term planning.\n","authors":["Yuyan Wang","Cheenar Banerjee","Samer Chucri","Fabio Soldo","Sriraj Badam","Ed H. Chi","Minmin Chen"],"pdf_url":"https://arxiv.org/pdf/2405.12327v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06158v1","updated":"2025-01-10T18:30:05Z","published":"2025-01-10T18:30:05Z","title":"GenMol: A Drug Discovery Generalist with Discrete Diffusion","summary":" Drug discovery is a complex process that involves multiple scenarios and\nstages, such as fragment-constrained molecule generation, hit generation and\nlead optimization. However, existing molecular generative models can only\ntackle one or two of these scenarios and lack the flexibility to address\nvarious aspects of the drug discovery pipeline. In this paper, we present\nGeneralist Molecular generative model (GenMol), a versatile framework that\naddresses these limitations by applying discrete diffusion to the Sequential\nAttachment-based Fragment Embedding (SAFE) molecular representation. GenMol\ngenerates SAFE sequences through non-autoregressive bidirectional parallel\ndecoding, thereby allowing utilization of a molecular context that does not\nrely on the specific token ordering and enhanced computational efficiency.\nMoreover, under the discrete diffusion framework, we introduce fragment\nremasking, a strategy that optimizes molecules by replacing fragments with\nmasked tokens and regenerating them, enabling effective exploration of chemical\nspace. 
GenMol significantly outperforms the previous GPT-based model trained on\nSAFE representations in de novo generation and fragment-constrained generation,\nand achieves state-of-the-art performance in goal-directed hit generation and\nlead optimization. These experimental results demonstrate that GenMol can\ntackle a wide range of drug discovery tasks, providing a unified and versatile\napproach for molecular design.\n","authors":["Seul Lee","Karsten Kreis","Srimukh Prasad Veccham","Meng Liu","Danny Reidenbach","Yuxing Peng","Saee Paliwal","Weili Nie","Arash Vahdat"],"pdf_url":"https://arxiv.org/pdf/2501.06158v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06148v1","updated":"2025-01-10T18:18:25Z","published":"2025-01-10T18:18:25Z","title":"From discrete-time policies to continuous-time diffusion samplers:\n Asymptotic equivalences and faster training","summary":" We study the problem of training neural stochastic differential equations, or\ndiffusion models, to sample from a Boltzmann distribution without access to\ntarget samples. Existing methods for training such models enforce time-reversal\nof the generative and noising processes, using either differentiable simulation\nor off-policy reinforcement learning (RL). We prove equivalences between\nfamilies of objectives in the limit of infinitesimal discretization steps,\nlinking entropic RL methods (GFlowNets) with continuous-time objects (partial\ndifferential equations and path space measures). 
We further show that an\nappropriate choice of coarse time discretization during training allows greatly\nimproved sample efficiency and the use of time-local objectives, achieving\ncompetitive performance on standard sampling benchmarks with reduced\ncomputational cost.\n","authors":["Julius Berner","Lorenz Richter","Marcin Sendera","Jarrid Rector-Brooks","Nikolay Malkin"],"pdf_url":"https://arxiv.org/pdf/2501.06148v1.pdf","comment":"code: https://github.com/GFNOrg/gfn-diffusion/tree/stagger"},{"id":"http://arxiv.org/abs/2410.02780v2","updated":"2025-01-10T18:14:56Z","published":"2024-09-17T19:07:13Z","title":"Guess What I Think: Streamlined EEG-to-Image Generation with Latent\n Diffusion Models","summary":" Generating images from brain waves is gaining increasing attention due to its\npotential to advance brain-computer interface (BCI) systems by understanding\nhow brain signals encode visual cues. Most of the literature has focused on\nfMRI-to-Image tasks as fMRI is characterized by high spatial resolution.\nHowever, fMRI is an expensive neuroimaging modality and does not allow for\nreal-time BCI. On the other hand, electroencephalography (EEG) is a low-cost,\nnon-invasive, and portable neuroimaging technique, making it an attractive\noption for future real-time applications. Nevertheless, EEG presents inherent\nchallenges due to its low spatial resolution and susceptibility to noise and\nartifacts, which makes generating images from EEG more difficult. In this\npaper, we address these problems with a streamlined framework based on the\nControlNet adapter for conditioning a latent diffusion model (LDM) through EEG\nsignals. 
We conduct experiments and ablation studies on popular benchmarks to\ndemonstrate that the proposed method beats other state-of-the-art models.\nUnlike these methods, which often require extensive preprocessing, pretraining,\ndifferent losses, and captioning models, our approach is efficient and\nstraightforward, requiring only minimal preprocessing and a few components. The\ncode is available at https://github.com/LuigiSigillo/GWIT.\n","authors":["Eleonora Lopez","Luigi Sigillo","Federica Colonnese","Massimo Panella","Danilo Comminiello"],"pdf_url":"https://arxiv.org/pdf/2410.02780v2.pdf","comment":"Accepted at ICASSP 2025"},{"id":"http://arxiv.org/abs/2501.06141v1","updated":"2025-01-10T18:03:46Z","published":"2025-01-10T18:03:46Z","title":"Emergent Symbol-like Number Variables in Artificial Neural Networks","summary":" What types of numeric representations emerge in Neural Networks (NNs)? To\nwhat degree do NNs induce abstract, mutable, slot-like numeric variables, and\nin what situations do these representations emerge? How do these\nrepresentations change over learning, and how can we understand the neural\nimplementations in ways that are unified across different NNs? In this work, we\napproach these questions by first training sequence based neural systems using\nNext Token Prediction (NTP) objectives on numeric tasks. We then seek to\nunderstand the neural solutions through the lens of causal abstractions or\nsymbolic algorithms. We use a combination of causal interventions and\nvisualization methods to find that artificial neural models do indeed develop\nanalogs of interchangeable, mutable, latent number variables purely from the\nNTP objective. 
We then ask how variations on the tasks and model architectures\naffect the models' learned solutions to find that these symbol-like numeric\nrepresentations do not form for every variant of the task, and transformers\nsolve the problem in a notably different way than their recurrent counterparts.\nWe then show how the symbol-like variables change over the course of training\nto find a strong correlation between the models' task performance and the\nalignment of their symbol-like representations. Lastly, we show that in all\ncases, some degree of gradience exists in these neural symbols, highlighting\nthe difficulty of finding simple, interpretable symbolic stories of how neural\nnetworks perform numeric tasks. Taken together, our results are consistent with\nthe view that neural networks can approximate interpretable symbolic programs\nof number cognition, but the particular program they approximate and the extent\nto which they approximate it can vary widely, depending on the network\narchitecture, training data, extent of training, and network size.\n","authors":["Satchel Grant","Noah D. Goodman","James L. McClelland"],"pdf_url":"https://arxiv.org/pdf/2501.06141v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11456v2","updated":"2025-01-10T17:54:39Z","published":"2024-09-17T17:48:12Z","title":"Two Stage Segmentation of Cervical Tumors using PocketNet","summary":" Cervical cancer remains the fourth most common malignancy amongst women\nworldwide.1 Concurrent chemoradiotherapy (CRT) serves as the mainstay\ndefinitive treatment regimen for locally advanced cervical cancers and includes\nexternal beam radiation followed by brachytherapy.2 Integral to radiotherapy\ntreatment planning is the routine contouring of both the target tumor at the\nlevel of the cervix, associated gynecologic anatomy and the adjacent organs at\nrisk (OARs). 
However, manual contouring of these structures is both time- and\nlabor-intensive and associated with known interobserver variability that can\nimpact treatment outcomes. While multiple tools have been developed to\nautomatically segment OARs and the high-risk clinical tumor volume (HR-CTV)\nusing computed tomography (CT) images,3,4,5,6 the development of deep\nlearning-based tumor segmentation tools using routine T2-weighted (T2w)\nmagnetic resonance imaging (MRI) addresses an unmet clinical need to improve\nthe routine contouring of both anatomical structures and cervical cancers,\nthereby increasing quality and consistency of radiotherapy planning. This work\napplied a novel deep-learning model (PocketNet) to segment the cervix, vagina,\nuterus, and tumor(s) on T2w MRI. The performance of the PocketNet architecture\nwas evaluated when trained via 5-fold cross-validation. PocketNet\nachieved a mean Dice-Sorensen similarity coefficient (DSC) exceeding 70% for\ntumor segmentation and 80% for organ segmentation. These results suggest that\nPocketNet is robust to variations in contrast protocols, providing reliable\nsegmentation of the regions of interest.\n","authors":["Awj Twam","Megan Jacobsen","Rachel Glenn","Peng Wei","Jia Sun","Ann Klopp","Aradhana M. Venkatesan","David Fuentes"],"pdf_url":"https://arxiv.org/pdf/2409.11456v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.02189v2","updated":"2025-01-10T17:43:10Z","published":"2025-01-04T04:59:33Z","title":"Benchmark Evaluations, Applications, and Challenges of Large Vision\n Language Models: A Survey","summary":" Multimodal Vision Language Models (VLMs) have emerged as a transformative\ntechnology at the intersection of computer vision and natural language\nprocessing, enabling machines to perceive and reason about the world through\nboth visual and textual modalities. 
For example, models such as CLIP, Claude,\nand GPT-4V demonstrate strong reasoning and understanding abilities on visual\nand textual data and beat classical single modality vision models on zero-shot\nclassification. Despite their rapid advancements in research and growing\npopularity in applications, a comprehensive survey of existing studies on VLMs\nis notably lacking, particularly for researchers aiming to leverage VLMs in\ntheir specific domains. To this end, we provide a systematic overview of VLMs\nin the following aspects: model information of the major VLMs developed over\nthe past five years (2019-2024); the main architectures and training methods of\nthese VLMs; summary and categorization of the popular benchmarks and evaluation\nmetrics of VLMs; the applications of VLMs including embodied agents, robotics,\nand video generation; the challenges and issues faced by current VLMs such as\nhallucination, fairness, and safety. Detailed collections including papers and\nmodel repository links are listed in\nhttps://github.com/zli12321/Awesome-VLM-Papers-And-Models.git.\n","authors":["Zongxia Li","Xiyang Wu","Hongyang Du","Huy Nghiem","Guangyao Shi"],"pdf_url":"https://arxiv.org/pdf/2501.02189v2.pdf","comment":"35 pages, 3 figures"},{"id":"http://arxiv.org/abs/2501.06126v1","updated":"2025-01-10T17:25:11Z","published":"2025-01-10T17:25:11Z","title":"Merging Feed-Forward Sublayers for Compressed Transformers","summary":" With the rise and ubiquity of larger deep learning models, the need for\nhigh-quality compression techniques is growing in order to deploy these models\nwidely. The sheer parameter count of these models makes it difficult to fit\nthem into the memory constraints of different hardware. 
In this work, we\npresent a novel approach to model compression by merging similar parameter\ngroups within a model, rather than pruning away less important parameters.\nSpecifically, we select, align, and merge separate feed-forward sublayers in\nTransformer models, and test our method on language modeling, image\nclassification, and machine translation. With our method, we demonstrate\nperformance comparable to the original models while combining more than a third\nof model feed-forward sublayers, and demonstrate improved performance over a\nstrong layer-pruning baseline. For instance, we can remove over 21% of total\nparameters from a Vision Transformer, while maintaining 99% of its original\nperformance. Additionally, we observe that some groups of feed-forward\nsublayers exhibit high activation similarity, which may help explain their\nsurprising mergeability.\n","authors":["Neha Verma","Kenton Murray","Kevin Duh"],"pdf_url":"https://arxiv.org/pdf/2501.06126v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08999v2","updated":"2025-01-10T17:04:47Z","published":"2023-12-14T14:44:08Z","title":"Conformalised data synthesis","summary":" With the proliferation of increasingly complicated Deep Learning\narchitectures, data synthesis is a highly promising technique to address the\ndemand of data-hungry models. However, reliably assessing the quality of a\n'synthesiser' model's output is an open research question with significant\nassociated risks for high-stake domains. To address this challenge, we propose\na unique synthesis algorithm that generates data from high-confidence feature\nspace regions based on the Conformal Prediction framework. We support our\nproposed algorithm with a comprehensive exploration of the core parameter's\ninfluence, an in-depth discussion of practical advice, and an extensive\nempirical evaluation of five benchmark datasets. 
To show our approach's\nversatility on ubiquitous real-world challenges, the datasets were carefully\nselected for their variety of difficult characteristics: low sample count,\nclass imbalance, and non-separability. In all trials, training sets extended\nwith our confident synthesised data performed at least as well as the original\nset and frequently significantly improved Deep Learning performance by up to 61\npercentage points F1-score.\n","authors":["Julia A. Meister","Khuong An Nguyen"],"pdf_url":"https://arxiv.org/pdf/2312.08999v2.pdf","comment":"Accepted for publication in the Machine Learning journal special\n issue \"Conformal Prediction and Distribution-Free Uncertainty Quantification\""},{"id":"http://arxiv.org/abs/2501.06108v1","updated":"2025-01-10T17:01:09Z","published":"2025-01-10T17:01:09Z","title":"Inferring High-Order Couplings with Neural Networks","summary":" Maximum-entropy methods, rooted in the inverse Ising/Potts problem from\nstatistical mechanics, have become indispensable tools for modeling pairwise\ninteractions in disciplines such as bioinformatics, ecology, and neuroscience.\nDespite their remarkable success, these methods often overlook high-order\ninteractions that may be crucial in complex systems. Conversely, while modern\nmachine learning approaches can capture such interactions, existing\ninterpretable frameworks are computationally expensive, making it impractical\nto assess the relevance of high-order interactions in real-world scenarios.\nRestricted Boltzmann Machines (RBMs) offer a computationally efficient\nalternative by encoding statistical correlations via hidden nodes in a\nbipartite neural network. Here, we present a method that maps RBMs exactly onto\ngeneralized Potts models with interactions of arbitrary high order. 
This\napproach leverages large-$N$ approximations, facilitated by the simple\narchitecture of the RBM, to enable the efficient extraction of effective\nmany-body couplings with minimal computational cost. This mapping also enables\nthe development of a general formal framework for the extraction of effective\nhigher-order interactions in arbitrarily complex probabilistic models.\nAdditionally, we introduce a robust formalism for gauge fixing within the\ngeneralized Potts model. We validate our method by accurately recovering two-\nand three-body interactions from synthetic datasets. Additionally, applying our\nframework to protein sequence data demonstrates its effectiveness in\nreconstructing protein contact maps, achieving performance comparable to\nstate-of-the-art inverse Potts models. These results position RBMs as a\npowerful and efficient tool for investigating high-order interactions in\ncomplex systems.\n","authors":["Aurélien Decelle","Alfonso de Jesús Navas Gómez","Beatriz Seoane"],"pdf_url":"https://arxiv.org/pdf/2501.06108v1.pdf","comment":"13 Pages and 3 Figures"},{"id":"http://arxiv.org/abs/2412.14306v2","updated":"2025-01-10T17:00:34Z","published":"2024-12-18T20:19:56Z","title":"Closing the Gap: A User Study on the Real-world Usefulness of AI-powered\n Vulnerability Detection & Repair in the IDE","summary":" This paper presents the first empirical study of a vulnerability detection\nand fix tool with professional software developers on real projects that they\nown. We implemented DeepVulGuard, an IDE-integrated tool based on\nstate-of-the-art detection and fix models, and show that it has promising\nperformance on benchmarks of historic vulnerability data. DeepVulGuard scans\ncode for vulnerabilities (including identifying the vulnerability type and\nvulnerable region of code), suggests fixes, provides natural-language\nexplanations for alerts and fixes, leveraging chat interfaces. 
We recruited 17\nprofessional software developers at Microsoft, observed their usage of the tool\non their code, and conducted interviews to assess the tool's usefulness, speed,\ntrust, relevance, and workflow integration. We also gathered detailed\nqualitative feedback on users' perceptions and their desired features. Study\nparticipants scanned a total of 24 projects, 6.9k files, and over 1.7 million\nlines of source code, and generated 170 alerts and 50 fix suggestions. We find\nthat although state-of-the-art AI-powered detection and fix tools show promise,\nthey are not yet practical for real-world use due to a high rate of false\npositives and non-applicable fixes. User feedback reveals several actionable\npain points, ranging from incomplete context to lack of customization for the\nuser's codebase. Additionally, we explore how AI features, including confidence\nscores, explanations, and chat interaction, can apply to vulnerability\ndetection and fixing. Based on these insights, we offer practical\nrecommendations for evaluating and deploying AI detection and fix models. Our\ncode and data are available at https://doi.org/10.6084/m9.figshare.26367139.\n","authors":["Benjamin Steenhoek","Kalpathy Sivaraman","Renata Saldivar Gonzalez","Yevhen Mohylevskyy","Roshanak Zilouchian Moghaddam","Wei Le"],"pdf_url":"https://arxiv.org/pdf/2412.14306v2.pdf","comment":"Accepted to ICSE 2025 research track. Camera-ready version with\n updated acknowledgments"},{"id":"http://arxiv.org/abs/2501.05409v2","updated":"2025-01-10T16:58:29Z","published":"2025-01-09T18:06:45Z","title":"Atlas: A Novel Pathology Foundation Model by Mayo Clinic, Charité, and\n Aignostics","summary":" Recent advances in digital pathology have demonstrated the effectiveness of\nfoundation models across diverse applications. In this report, we present\nAtlas, a novel vision foundation model based on the RudolfV approach. 
Our model\nwas trained on a dataset comprising 1.2 million histopathology whole slide\nimages, collected from two medical institutions: Mayo Clinic and Charit\\'e -\nUniversit\\\"atsmedizin Berlin. Comprehensive evaluations show that Atlas achieves\nstate-of-the-art performance across twenty-one public benchmark datasets, even\nthough it is neither the largest model by parameter count nor by training\ndataset size.\n","authors":["Maximilian Alber","Stephan Tietz","Jonas Dippel","Timo Milbich","Timothée Lesort","Panos Korfiatis","Moritz Krügener","Beatriz Perez Cancer","Neelay Shah","Alexander Möllers","Philipp Seegerer","Alexandra Carpen-Amarie","Kai Standvoss","Gabriel Dernbach","Edwin de Jong","Simon Schallenberg","Andreas Kunft","Helmut Hoffer von Ankershoffen","Gavin Schaeferle","Patrick Duffy","Matt Redlon","Philipp Jurmeister","David Horst","Lukas Ruff","Klaus-Robert Müller","Frederick Klauschen","Andrew Norgan"],"pdf_url":"https://arxiv.org/pdf/2501.05409v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06103v1","updated":"2025-01-10T16:54:56Z","published":"2025-01-10T16:54:56Z","title":"Finite-Horizon Single-Pull Restless Bandits: An Efficient Index Policy\n For Scarce Resource Allocation","summary":" Restless multi-armed bandits (RMABs) have been highly successful in\noptimizing sequential resource allocation across many domains. However, in many\npractical settings with highly scarce resources, where each agent can only\nreceive at most one resource, such as healthcare intervention programs, the\nstandard RMAB framework falls short. To tackle such scenarios, we introduce\nFinite-Horizon Single-Pull RMABs (SPRMABs), a novel variant in which each arm\ncan only be pulled once. 
This single-pull constraint introduces additional\ncomplexity, rendering many existing RMAB solutions suboptimal or ineffective.\nTo address this shortcoming, we propose using \\textit{dummy\nstates} that expand the system and enforce the one-pull constraint. We then\ndesign a lightweight index policy for this expanded system. For the first time,\nwe demonstrate that our index policy achieves a sub-linearly decaying average\noptimality gap of $\\tilde{\\mathcal{O}}\\left(\\frac{1}{\\rho^{1/2}}\\right)$ for a\nfinite number of arms, where $\\rho$ is the scaling factor for each arm cluster.\nExtensive simulations validate the proposed method, showing robust performance\nacross various domains compared to existing benchmarks.\n","authors":["Guojun Xiong","Haichuan Wang","Yuqi Pan","Saptarshi Mandal","Sanket Shah","Niclas Boehmer","Milind Tambe"],"pdf_url":"https://arxiv.org/pdf/2501.06103v1.pdf","comment":"17 Pages, 8 figures. Accepted by AAMAS 2025"},{"id":"http://arxiv.org/abs/2501.06099v1","updated":"2025-01-10T16:53:48Z","published":"2025-01-10T16:53:48Z","title":"Explaining Deep Learning-based Anomaly Detection in Energy Consumption\n Data by Focusing on Contextually Relevant Data","summary":" Detecting anomalies in energy consumption data is crucial for identifying\nenergy waste, equipment malfunction, and overall, for ensuring efficient energy\nmanagement. Machine learning, and specifically deep learning approaches, have\nbeen greatly successful in anomaly detection; however, they are black-box\napproaches that do not provide transparency or explanations. SHAP and its\nvariants have been proposed to explain these models, but they suffer from high\ncomputational complexity (SHAP) or instability and inconsistency (e.g., Kernel\nSHAP). 
To address these challenges, this paper proposes an explainability\napproach for anomalies in energy consumption data that focuses on\ncontext-relevant information. The proposed approach leverages existing\nexplainability techniques, focusing on SHAP variants, together with global\nfeature importance and weighted cosine similarity to select background dataset\nbased on the context of each anomaly point. By focusing on the context and most\nrelevant features, this approach mitigates the instability of explainability\nalgorithms. Experimental results across 10 different machine learning models,\nfive datasets, and five XAI techniques, demonstrate that our method reduces the\nvariability of explanations providing consistent explanations. Statistical\nanalyses confirm the robustness of our approach, showing an average reduction\nin variability of approximately 38% across multiple datasets.\n","authors":["Mohammad Noorchenarboo","Katarina Grolinger"],"pdf_url":"https://arxiv.org/pdf/2501.06099v1.pdf","comment":"26 pages, 8 figures"},{"id":"http://arxiv.org/abs/2501.06089v1","updated":"2025-01-10T16:39:01Z","published":"2025-01-10T16:39:01Z","title":"Towards Developing Socially Compliant Automated Vehicles: State of the\n Art, Experts Expectations, and A Conceptual Framework","summary":" Automated Vehicles (AVs) hold promise for revolutionizing transportation by\nimproving road safety, traffic efficiency, and overall mobility. Despite the\nsteady advancement in high-level AVs in recent years, the transition to full\nautomation entails a period of mixed traffic, where AVs of varying automation\nlevels coexist with human-driven vehicles (HDVs). Making AVs socially compliant\nand understood by human drivers is expected to improve the safety and\nefficiency of mixed traffic. Thus, ensuring AVs compatibility with HDVs and\nsocial acceptance is crucial for their successful and seamless integration into\nmixed traffic. 
However, research in this critical area of developing Socially\nCompliant AVs (SCAVs) remains sparse. This study carries out the first\ncomprehensive scoping review to assess the current state of the art in\ndeveloping SCAVs, identifying key concepts, methodological approaches, and\nresearch gaps. An expert interview was also conducted to identify critical\nresearch gaps and expectations towards SCAVs. Based on the scoping review and\nexpert interview input, a conceptual framework is proposed for the development\nof SCAVs. The conceptual framework is evaluated using an online survey\ntargeting researchers, technicians, policymakers, and other relevant\nprofessionals worldwide. The survey results provide valuable validation and\ninsights, affirming the significance of the proposed conceptual framework in\ntackling the challenges of integrating AVs into mixed-traffic environments.\nAdditionally, future research perspectives and suggestions are discussed,\ncontributing to the research and development agenda of SCAVs.\n","authors":["Yongqi Dong","Bart van Arem","Haneen Farah"],"pdf_url":"https://arxiv.org/pdf/2501.06089v1.pdf","comment":"39 pages, 13 figures, under review by the journal of Transportation\n Research Part E: Logistics and Transportation Review"},{"id":"http://arxiv.org/abs/2501.06086v1","updated":"2025-01-10T16:34:19Z","published":"2025-01-10T16:34:19Z","title":"All AI Models are Wrong, but Some are Optimal","summary":" AI models that predict the future behavior of a system (a.k.a. predictive AI\nmodels) are central to intelligent decision-making. However, decision-making\nusing predictive AI models often results in suboptimal performance. This is\nprimarily because AI models are typically constructed to best fit the data, and\nhence to predict the most likely future rather than to enable high-performance\ndecision-making. The hope that such prediction enables high-performance\ndecisions is neither guaranteed in theory nor established in practice. 
In fact,\nthere is increasing empirical evidence that predictive models must be tailored\nto decision-making objectives for performance. In this paper, we establish\nformal (necessary and sufficient) conditions that a predictive model (AI-based\nor not) must satisfy for a decision-making policy established using that model\nto be optimal. We then discuss their implications for building predictive AI\nmodels for sequential decision-making.\n","authors":["Akhil S Anand","Shambhuraj Sawant","Dirk Reinhardt","Sebastien Gros"],"pdf_url":"https://arxiv.org/pdf/2501.06086v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.06433v3","updated":"2025-01-10T16:26:43Z","published":"2022-10-12T17:30:12Z","title":"Self-supervised video pretraining yields robust and more human-aligned\n visual representations","summary":" Humans learn powerful representations of objects and scenes by observing how\nthey evolve over time. Yet, outside of specific tasks that require explicit\ntemporal understanding, static image pretraining remains the dominant paradigm\nfor learning visual foundation models. We question this mismatch, and ask\nwhether video pretraining can yield visual representations that bear the\nhallmarks of human perception: generalisation across tasks, robustness to\nperturbations, and consistency with human judgements. To that end we propose a\nnovel procedure for curating videos, and develop a contrastive framework which\nlearns from the complex transformations therein. This simple paradigm for\ndistilling knowledge from videos, called VITO, yields general representations\nthat far outperform prior video pretraining methods on image understanding\ntasks, and image pretraining methods on video understanding tasks. Moreover,\nVITO representations are significantly more robust to natural and synthetic\ndeformations than image-, video-, and adversarially-trained ones. 
Finally,\nVITO's predictions are strongly aligned with human judgements, surpassing\nmodels that were specifically trained for that purpose. Together, these results\nsuggest that video pretraining could be a simple way of learning unified,\nrobust, and human-aligned representations of the visual world.\n","authors":["Nikhil Parthasarathy","S. M. Ali Eslami","João Carreira","Olivier J. Hénaff"],"pdf_url":"https://arxiv.org/pdf/2210.06433v3.pdf","comment":"Accepted to 37th Conference on Neural Information Processing Systems\n (NeurIPS 2023)"},{"id":"http://arxiv.org/abs/2501.06081v1","updated":"2025-01-10T16:15:25Z","published":"2025-01-10T16:15:25Z","title":"Averaged Adam accelerates stochastic optimization in the training of\n deep neural network approximations for partial differential equation and\n optimal control problems","summary":" Deep learning methods - usually consisting of a class of deep neural networks\n(DNNs) trained by a stochastic gradient descent (SGD) optimization method - are\nnowadays omnipresent in data-driven learning problems as well as in scientific\ncomputing tasks such as optimal control (OC) and partial differential equation\n(PDE) problems. In practically relevant learning tasks, often not the\nplain-vanilla standard SGD optimization method is employed to train the\nconsidered class of DNNs but instead more sophisticated adaptive and\naccelerated variants of the standard SGD method such as the popular Adam\noptimizer are used. Inspired by the classical Polyak-Ruppert averaging\napproach, in this work we apply averaged variants of the Adam optimizer to\ntrain DNNs to approximately solve exemplary scientific computing problems in\nthe form of PDEs and OC problems. 
We test the averaged variants of Adam in a\nseries of learning problems including physics-informed neural network (PINN),\ndeep backward stochastic differential equation (deep BSDE), and deep Kolmogorov\napproximations for PDEs (such as heat, Black-Scholes, Burgers, and Allen-Cahn\nPDEs), including DNN approximations for OC problems, and including DNN\napproximations for image classification problems (ResNet for CIFAR-10). In each\nof the numerical examples the employed averaged variants of Adam outperform the\nstandard Adam and the standard SGD optimizers, particularly, in the situation\nof the scientific machine learning problems. The Python source codes for the\nnumerical experiments associated to this work can be found on GitHub at\nhttps://github.com/deeplearningmethods/averaged-adam.\n","authors":["Steffen Dereich","Arnulf Jentzen","Adrian Riekert"],"pdf_url":"https://arxiv.org/pdf/2501.06081v1.pdf","comment":"25 pages, 10 figures"},{"id":"http://arxiv.org/abs/2501.06080v1","updated":"2025-01-10T16:15:23Z","published":"2025-01-10T16:15:23Z","title":"Scale-up Unlearnable Examples Learning with High-Performance Computing","summary":" Recent advancements in AI models are structured to retain user interactions,\nwhich could inadvertently include sensitive healthcare data. In the healthcare\nfield, particularly when radiologists use AI-driven diagnostic tools hosted on\nonline platforms, there is a risk that medical imaging data may be repurposed\nfor future AI training without explicit consent, spotlighting critical privacy\nand intellectual property concerns around healthcare data usage. Addressing\nthese privacy challenges, a novel approach known as Unlearnable Examples (UEs)\nhas been introduced, aiming to make data unlearnable to deep learning models. A\nprominent method within this area, called Unlearnable Clustering (UC), has\nshown improved UE performance with larger batch sizes but was previously\nlimited by computational resources. 
To push the boundaries of UE performance\nwith theoretically unlimited resources, we scaled up UC learning across various\ndatasets using Distributed Data Parallel (DDP) training on the Summit\nsupercomputer. Our goal was to examine UE efficacy at high-performance\ncomputing (HPC) levels to prevent unauthorized learning and enhance data\nsecurity, particularly exploring the impact of batch size on UE's\nunlearnability. Utilizing the robust computational capabilities of the Summit,\nextensive experiments were conducted on diverse datasets such as Pets,\nMedMNist, Flowers, and Flowers102. Our findings reveal that both overly large\nand overly small batch sizes can lead to performance instability and affect\naccuracy. However, the relationship between batch size and unlearnability\nvaried across datasets, highlighting the necessity for tailored batch size\nstrategies to achieve optimal data protection. Our results underscore the\ncritical role of selecting appropriate batch sizes based on the specific\ncharacteristics of each dataset to prevent learning and ensure data security in\ndeep learning applications.\n","authors":["Yanfan Zhu","Issac Lyngaas","Murali Gopalakrishnan Meena","Mary Ellen I. Koran","Bradley Malin","Daniel Moyer","Shunxing Bao","Anuj Kapadia","Xiao Wang","Bennett Landman","Yuankai Huo"],"pdf_url":"https://arxiv.org/pdf/2501.06080v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06078v1","updated":"2025-01-10T16:14:35Z","published":"2025-01-10T16:14:35Z","title":"Explaining k-Nearest Neighbors: Abductive and Counterfactual\n Explanations","summary":" Despite the wide use of $k$-Nearest Neighbors as classification models, their\nexplainability properties remain poorly understood from a theoretical\nperspective. 
While nearest neighbors classifiers offer interpretability from a\n\"data perspective\", in which the classification of an input vector $\\bar{x}$ is\nexplained by identifying the vectors $\\bar{v}_1, \\ldots, \\bar{v}_k$ in the\ntraining set that determine the classification of $\\bar{x}$, we argue that such\nexplanations can be impractical in high-dimensional applications, where each\nvector has hundreds or thousands of features and it is not clear what their\nrelative importance is. Hence, we focus on understanding nearest neighbor\nclassifications through a \"feature perspective\", in which the goal is to\nidentify how the values of the features in $\\bar{x}$ affect its classification.\nConcretely, we study abductive explanations such as \"minimum sufficient\nreasons\", which correspond to sets of features in $\\bar{x}$ that are enough to\nguarantee its classification, and \"counterfactual explanations\" based on the\nminimum distance feature changes one would have to perform in $\\bar{x}$ to\nchange its classification. We present a detailed landscape of positive and\nnegative complexity results for counterfactual and abductive explanations,\ndistinguishing between discrete and continuous feature spaces, and considering\nthe impact of the choice of distance function involved. 
Finally, we show that\ndespite some negative complexity results, Integer Quadratic Programming and SAT\nsolving allow for computing explanations in practice.\n","authors":["Pablo Barceló","Alexander Kozachinskiy","Miguel Romero Orth","Bernardo Subercaseaux","José Verschae"],"pdf_url":"https://arxiv.org/pdf/2501.06078v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06077v1","updated":"2025-01-10T16:14:08Z","published":"2025-01-10T16:14:08Z","title":"Explainable Federated Bayesian Causal Inference and Its Application in\n Advanced Manufacturing","summary":" Causal inference has recently gained notable attention across various fields\nlike biology, healthcare, and environmental science, especially within\nexplainable artificial intelligence (xAI) systems, for uncovering the causal\nrelationships among multiple variables and outcomes. Yet, it has not been fully\nrecognized and deployed in the manufacturing systems. In this paper, we\nintroduce an explainable, scalable, and flexible federated Bayesian learning\nframework, \\texttt{xFBCI}, designed to explore causality through treatment\neffect estimation in distributed manufacturing systems. By leveraging federated\nBayesian learning, we efficiently estimate posterior of local parameters to\nderive the propensity score for each client without accessing local private\ndata. These scores are then used to estimate the treatment effect using\npropensity score matching (PSM). 
Through simulations on various datasets and a\nreal-world Electrohydrodynamic (EHD) printing data, we demonstrate that our\napproach outperforms standard Bayesian causal inference methods and several\nstate-of-the-art federated learning benchmarks.\n","authors":["Xiaofeng Xiao","Khawlah Alharbi","Pengyu Zhang","Hantang Qin","Xubo Yue"],"pdf_url":"https://arxiv.org/pdf/2501.06077v1.pdf","comment":"26 pages"},{"id":"http://arxiv.org/abs/2501.06076v1","updated":"2025-01-10T16:13:57Z","published":"2025-01-10T16:13:57Z","title":"A monthly sub-national Harmonized Food Insecurity Dataset for\n comprehensive analysis and predictive modeling","summary":" Food security is a complex, multidimensional concept challenging to measure\ncomprehensively. Effective anticipation, monitoring, and mitigation of food\ncrises require timely and comprehensive global data. This paper introduces the\nHarmonized Food Insecurity Dataset (HFID), an open-source resource\nconsolidating four key data sources: the Integrated Food Security Phase\nClassification (IPC)/Cadre Harmonis\\'e (CH) phases, the Famine Early Warning\nSystems Network (FEWS NET) IPC-compatible phases, and the World Food Program's\n(WFP) Food Consumption Score (FCS) and reduced Coping Strategy Index (rCSI).\nUpdated monthly and using a common reference system for administrative units,\nthe HFID offers extensive spatial and temporal coverage. It serves as a vital\ntool for food security experts and humanitarian agencies, providing a unified\nresource for analyzing food security conditions and highlighting global data\ndisparities. 
The scientific community can also leverage the HFID to develop\ndata-driven predictive models, enhancing the capacity to forecast and prevent\nfuture food crises.\n","authors":["Machefer Mélissande","Michele Ronco","Anne-Claire Thomas","Michael Assouline","Melanie Rabier","Christina Corbane","Felix Rembold"],"pdf_url":"https://arxiv.org/pdf/2501.06076v1.pdf","comment":"The authors Melissande Machefer and Michele Ronco have contributed\n equally as both first authors to this work. This work is currently being\n reviewed in a peer-reviewed journal"},{"id":"http://arxiv.org/abs/2501.06074v1","updated":"2025-01-10T16:11:27Z","published":"2025-01-10T16:11:27Z","title":"Geometry and Optimization of Shallow Polynomial Networks","summary":" We study shallow neural networks with polynomial activations. The function\nspace for these models can be identified with a set of symmetric tensors with\nbounded rank. We describe general features of these networks, focusing on the\nrelationship between width and optimization. We then consider teacher-student\nproblems, that can be viewed as a problem of low-rank tensor approximation with\nrespect to a non-standard inner product that is induced by the data\ndistribution. In this setting, we introduce a teacher-metric discriminant which\nencodes the qualitative behavior of the optimization as a function of the\ntraining data distribution. Finally, we focus on networks with quadratic\nactivations, presenting an in-depth analysis of the optimization landscape. 
In\nparticular, we present a variation of the Eckart-Young Theorem characterizing\nall critical points and their Hessian signatures for teacher-student problems\nwith quadratic networks and Gaussian training data.\n","authors":["Yossi Arjevani","Joan Bruna","Joe Kileel","Elzbieta Polak","Matthew Trager"],"pdf_url":"https://arxiv.org/pdf/2501.06074v1.pdf","comment":"36 pages, 2 figures"},{"id":"http://arxiv.org/abs/2308.08235v2","updated":"2025-01-10T16:02:22Z","published":"2023-08-16T09:12:21Z","title":"The Expressive Power of Graph Neural Networks: A Survey","summary":" Graph neural networks (GNNs) are effective machine learning models for many\ngraph-related applications. Despite their empirical success, many research\nefforts focus on the theoretical limitations of GNNs, i.e., the GNNs expressive\npower. Early works in this domain mainly focus on studying the graph\nisomorphism recognition ability of GNNs, and recent works try to leverage the\nproperties such as subgraph counting and connectivity learning to characterize\nthe expressive power of GNNs, which are more practical and closer to\nreal-world. However, no survey papers and open-source repositories\ncomprehensively summarize and discuss models in this important direction. To\nfill the gap, we conduct a first survey for models for enhancing expressive\npower under different forms of definition. 
Concretely, the models are reviewed\nbased on three categories, i.e., Graph feature enhancement, Graph topology\nenhancement, and GNNs architecture enhancement.\n","authors":["Bingxu Zhang","Changjun Fan","Shixuan Liu","Kuihua Huang","Xiang Zhao","Jincai Huang","Zhong Liu"],"pdf_url":"https://arxiv.org/pdf/2308.08235v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06066v1","updated":"2025-01-10T15:57:23Z","published":"2025-01-10T15:57:23Z","title":"Distilling Calibration via Conformalized Credal Inference","summary":" Deploying artificial intelligence (AI) models on edge devices involves a\ndelicate balance between meeting stringent complexity constraints, such as\nlimited memory and energy resources, and ensuring reliable performance in\nsensitive decision-making tasks. One way to enhance reliability is through\nuncertainty quantification via Bayesian inference. This approach, however,\ntypically necessitates maintaining and running multiple models in an ensemble,\nwhich may exceed the computational limits of edge devices. This paper\nintroduces a low-complexity methodology to address this challenge by distilling\ncalibration information from a more complex model. In an offline phase,\npredictive probabilities generated by a high-complexity cloud-based model are\nleveraged to determine a threshold based on the typical divergence between the\ncloud and edge models. At run time, this threshold is used to construct credal\nsets -- ranges of predictive probabilities that are guaranteed, with a\nuser-selected confidence level, to include the predictions of the cloud model.\nThe credal sets are obtained through thresholding of a divergence measure in\nthe simplex of predictive probabilities. 
Experiments on visual and language\ntasks demonstrate that the proposed approach, termed Conformalized Distillation\nfor Credal Inference (CD-CI), significantly improves calibration performance\ncompared to low-complexity Bayesian methods, such as Laplace approximation,\nmaking it a practical and efficient solution for edge AI deployments.\n","authors":["Jiayi Huang","Sangwoo Park","Nicola Paoletti","Osvaldo Simeone"],"pdf_url":"https://arxiv.org/pdf/2501.06066v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2412.07312v2","updated":"2025-01-10T15:46:25Z","published":"2024-12-10T08:50:35Z","title":"High-dimensional classification problems with Barron regular boundaries\n under margin conditions","summary":" We prove that a classifier with a Barron-regular decision boundary can be\napproximated with a rate of high polynomial degree by ReLU neural networks with\nthree hidden layers when a margin condition is assumed. In particular, for\nstrong margin conditions, high-dimensional discontinuous classifiers can be\napproximated with a rate that is typically only achievable when approximating a\nlow-dimensional smooth function. We demonstrate how these expression rate\nbounds imply fast-rate learning bounds that are close to $n^{-1}$ where $n$ is\nthe number of samples. In addition, we carry out comprehensive numerical\nexperimentation on binary classification problems with various margins. 
We\nstudy three different dimensions, with the highest dimensional problem\ncorresponding to images from the MNIST data set.\n","authors":["Jonathan García","Philipp Petersen"],"pdf_url":"https://arxiv.org/pdf/2412.07312v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06062v1","updated":"2025-01-10T15:46:19Z","published":"2025-01-10T15:46:19Z","title":"Personalized Language Model Learning on Text Data Without User\n Identifiers","summary":" In many practical natural language applications, user data are highly\nsensitive, requiring anonymous uploads of text data from mobile devices to the\ncloud without user identifiers. However, the absence of user identifiers\nrestricts the ability of cloud-based language models to provide personalized\nservices, which are essential for catering to diverse user needs. The trivial\nmethod of replacing an explicit user identifier with a static user embedding as\nmodel input still compromises data anonymization. In this work, we propose to\nlet each mobile device maintain a user-specific distribution to dynamically\ngenerate user embeddings, thereby breaking the one-to-one mapping between an\nembedding and a specific user. We further theoretically demonstrate that to\nprevent the cloud from tracking users via uploaded embeddings, the local\ndistributions of different users should either be derived from a linearly\ndependent space to avoid identifiability or be close to each other to prevent\naccurate attribution. 
Evaluation on both public and industrial datasets using\ndifferent language models reveals a remarkable improvement in accuracy from\nincorporating anonymous user embeddings, while preserving real-time inference\nrequirement.\n","authors":["Yucheng Ding","Yangwenjian Tan","Xiangyu Liu","Chaoyue Niu","Fandong Meng","Jie Zhou","Ning Liu","Fan Wu","Guihai Chen"],"pdf_url":"https://arxiv.org/pdf/2501.06062v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06059v1","updated":"2025-01-10T15:40:31Z","published":"2025-01-10T15:40:31Z","title":"COMIX: Compositional Explanations using Prototypes","summary":" Aligning machine representations with human understanding is key to improving\ninterpretability of machine learning (ML) models. When classifying a new image,\nhumans often explain their decisions by decomposing the image into concepts and\npointing to corresponding regions in familiar images. Current ML explanation\ntechniques typically either trace decision-making processes to reference\nprototypes, generate attribution maps highlighting feature importance, or\nincorporate intermediate bottlenecks designed to align with human-interpretable\nconcepts. The proposed method, named COMIX, classifies an image by decomposing\nit into regions based on learned concepts and tracing each region to\ncorresponding ones in images from the training dataset, assuring that\nexplanations fully represent the actual decision-making process. We dissect the\ntest image into selected internal representations of a neural network to derive\nprototypical parts (primitives) and match them with the corresponding\nprimitives derived from the training data. In a series of qualitative and\nquantitative experiments, we theoretically prove and demonstrate that our\nmethod, in contrast to post hoc analysis, provides fidelity of explanations and\nshows that the efficiency is competitive with other inherently interpretable\narchitectures. 
Notably, it shows substantial improvements in fidelity and\nsparsity metrics, including 48.82% improvement in the C-insertion score on the\nImageNet dataset over the best state-of-the-art baseline.\n","authors":["Sarath Sivaprasad","Dmitry Kangin","Plamen Angelov","Mario Fritz"],"pdf_url":"https://arxiv.org/pdf/2501.06059v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06058v1","updated":"2025-01-10T15:39:39Z","published":"2025-01-10T15:39:39Z","title":"Learning Flexible Heterogeneous Coordination with Capability-Aware\n Shared Hypernetworks","summary":" Cooperative heterogeneous multi-agent tasks require agents to effectively\ncoordinate their behaviors while accounting for their relative capabilities.\nLearning-based solutions to this challenge span between two extremes: i)\nshared-parameter methods, which encode diverse behaviors within a single\narchitecture by assigning an ID to each agent, and are sample-efficient but\nresult in limited behavioral diversity; ii) independent methods, which learn a\nseparate policy for each agent, and show greater behavioral diversity but lack\nsample-efficiency. Prior work has also explored selective parameter-sharing,\nallowing for a compromise between diversity and efficiency. None of these\napproaches, however, effectively generalize to unseen agents or teams. We\npresent Capability-Aware Shared Hypernetworks (CASH), a novel architecture for\nheterogeneous multi-agent coordination that generates sufficient diversity\nwhile maintaining sample-efficiency via soft parameter-sharing hypernetworks.\nIntuitively, CASH allows the team to learn common strategies using a shared\nencoder, which are then adapted according to the team's individual and\ncollective capabilities with a hypernetwork, allowing for zero-shot\ngeneralization to unseen teams and agents. 
We present experiments across two\nheterogeneous coordination tasks and three standard learning paradigms\n(imitation learning, on- and off-policy reinforcement learning). CASH is able\nto outperform baseline architectures in success rate and sample efficiency when\nevaluated on unseen teams and agents despite using less than half of the\nlearnable parameters.\n","authors":["Kevin Fu","Pierce Howell","Shalin Jain","Harish Ravichandar"],"pdf_url":"https://arxiv.org/pdf/2501.06058v1.pdf","comment":"11 pages, 6 figures, equal authorship between Pierce Howell and\n Shalin Jain"},{"id":"http://arxiv.org/abs/2202.13059v5","updated":"2025-01-10T15:30:29Z","published":"2022-02-26T04:49:01Z","title":"Theoretical Error Analysis of Entropy Approximation for Gaussian Mixture","summary":" Gaussian mixture distributions are commonly employed to represent general\nprobability distributions. Despite the importance of using Gaussian mixtures\nfor uncertainty estimation, the entropy of a Gaussian mixture cannot be\ncalculated analytically. In this paper, we study the approximate entropy\nrepresented as the sum of the entropies of unimodal Gaussian distributions with\nmixing coefficients. This approximation is easy to calculate analytically\nregardless of dimension, but there is a lack of theoretical guarantees. We\ntheoretically analyze the approximation error between the true and the\napproximate entropy to reveal when this approximation works effectively. This\nerror is essentially controlled by how far apart each Gaussian component of the\nGaussian mixture is. To measure such separation, we introduce the ratios of the\ndistances between the means to the sum of the variances of each Gaussian\ncomponent of the Gaussian mixture, and we reveal that the error converges to\nzero as the ratios tend to infinity. In addition, the probabilistic estimate\nindicates that this convergence situation is more likely to occur in\nhigher-dimensional spaces. 
Therefore, our results provide a guarantee that this\napproximation works well for high-dimensional problems, such as neural networks\nthat involve a large number of parameters.\n","authors":["Takashi Furuya","Hiroyuki Kusumoto","Koichi Taniguchi","Naoya Kanno","Kazuma Suetake"],"pdf_url":"https://arxiv.org/pdf/2202.13059v5.pdf","comment":"35 pages, 4 figures"},{"id":"http://arxiv.org/abs/2410.05289v3","updated":"2025-01-10T15:25:06Z","published":"2024-10-02T14:14:17Z","title":"MARS: A neurosymbolic approach for interpretable drug discovery","summary":" Neurosymbolic (NeSy) artificial intelligence describes the combination of\nlogic or rule-based techniques with neural networks. Compared to neural\napproaches, NeSy methods often possess enhanced interpretability, which is\nparticularly promising for biomedical applications like drug discovery.\nHowever, since interpretability is broadly defined, there are no clear\nguidelines for assessing the biological plausibility of model interpretations.\nTo assess interpretability in the context of drug discovery, we devise a novel\nprediction task, called drug mechanism-of-action (MoA) deconvolution, with an\nassociated, tailored knowledge graph (KG), MoA-net. We then develop the MoA\nRetrieval System (MARS), a NeSy approach for drug discovery which leverages\nlogical rules with learned rule weights. Using this interpretable feature\nalongside domain knowledge, we find that MARS and other NeSy approaches on KGs\nare susceptible to reasoning shortcuts, in which the prediction of true labels\nis driven by \"degree-bias\" rather than the domain-based rules. Subsequently, we\ndemonstrate ways to identify and mitigate this. Thereafter, MARS achieves\nperformance on par with current state-of-the-art models while producing model\ninterpretations aligned with known MoAs.\n","authors":["Lauren Nicole DeLong","Yojana Gadiya","Paola Galdi","Jacques D. 
Fleuriot","Daniel Domingo-Fernández"],"pdf_url":"https://arxiv.org/pdf/2410.05289v3.pdf","comment":"Under review. 10 pages, 7 supplementary pages. Corresponding code is\n here: https://github.com/laurendelong21/MARS and here:\n https://github.com/laurendelong21/MoA-Net"},{"id":"http://arxiv.org/abs/2501.06039v1","updated":"2025-01-10T15:17:27Z","published":"2025-01-10T15:17:27Z","title":"AI-powered virtual tissues from spatial proteomics for clinical\n diagnostics and biomedical discovery","summary":" Spatial proteomics technologies have transformed our understanding of complex\ntissue architectures by enabling simultaneous analysis of multiple molecular\nmarkers and their spatial organization. The high dimensionality of these data,\nvarying marker combinations across experiments and heterogeneous study designs\npose unique challenges for computational analysis. Here, we present Virtual\nTissues (VirTues), a foundation model framework for biological tissues that\noperates across the molecular, cellular and tissue scale. VirTues introduces\ninnovations in transformer architecture design, including a novel tokenization\nscheme that captures both spatial and marker dimensions, and attention\nmechanisms that scale to high-dimensional multiplex data while maintaining\ninterpretability. Trained on diverse cancer and non-cancer tissue datasets,\nVirTues demonstrates strong generalization capabilities without task-specific\nfine-tuning, enabling cross-study analysis and novel marker integration. 
As a\ngeneralist model, VirTues outperforms existing approaches across clinical\ndiagnostics, biological discovery and patient case retrieval tasks, while\nproviding insights into tissue function and disease mechanisms.\n","authors":["Johann Wenckstern","Eeshaan Jain","Kiril Vasilev","Matteo Pariset","Andreas Wicki","Gabriele Gut","Charlotte Bunne"],"pdf_url":"https://arxiv.org/pdf/2501.06039v1.pdf","comment":"23 pages, 5 figures"},{"id":"http://arxiv.org/abs/2401.11940v3","updated":"2025-01-10T15:07:43Z","published":"2024-01-22T13:30:11Z","title":"Low-Tubal-Rank Tensor Recovery via Factorized Gradient Descent","summary":" This paper considers the problem of recovering a tensor with an underlying\nlow-tubal-rank structure from a small number of corrupted linear measurements.\nTraditional approaches tackling such a problem require the computation of\ntensor Singular Value Decomposition (t-SVD), which is a computationally\nintensive process, rendering them impractical for dealing with large-scale\ntensors. Aiming to address this challenge, we propose an efficient and effective\nlow-tubal-rank tensor recovery method based on a factorization procedure akin\nto the Burer-Monteiro (BM) method. Precisely, our fundamental approach involves\ndecomposing a large tensor into two smaller factor tensors, followed by solving\nthe problem through factorized gradient descent (FGD). This strategy eliminates\nthe need for t-SVD computation, thereby reducing computational costs and\nstorage requirements. We provide rigorous theoretical analysis to ensure the\nconvergence of FGD under both noise-free and noisy situations. Additionally, it\nis worth noting that our method does not require the precise estimation of the\ntensor tubal-rank. Even in cases where the tubal-rank is slightly\noverestimated, our approach continues to demonstrate robust performance. 
A\nseries of experiments have been carried out to demonstrate that, compared to\nother popular methods, our approach exhibits superior performance in multiple\nscenarios, in terms of faster computational speed and smaller\nconvergence error.\n","authors":["Zhiyu Liu","Zhi Han","Yandong Tang","Xi-Le Zhao","Yao Wang"],"pdf_url":"https://arxiv.org/pdf/2401.11940v3.pdf","comment":"13 pages, 4 figures"},{"id":"http://arxiv.org/abs/2405.06653v2","updated":"2025-01-10T15:02:43Z","published":"2024-04-08T08:25:25Z","title":"A unified cross-attention model for predicting antigen binding\n specificity to both HLA and TCR molecules","summary":" The immune checkpoint inhibitors have demonstrated promising clinical\nefficacy across various tumor types, yet the percentage of patients who benefit\nfrom them remains low. The bindings between tumor antigens and HLA-I/TCR\nmolecules determine the antigen presentation and T-cell activation, thereby\nplaying an important role in the immunotherapy response. In this paper, we\npropose UnifyImmun, a unified cross-attention transformer model designed to\nsimultaneously predict the bindings of peptides to both receptors, providing\nmore comprehensive evaluation of antigen immunogenicity. We devise a two-phase\nstrategy using virtual adversarial training that enables these two tasks to\nreinforce each other mutually, by compelling the encoders to extract more\nexpressive features. Our method demonstrates superior performance in predicting\nboth pHLA and pTCR binding on multiple independent and external test sets.\nNotably, on a large-scale COVID-19 pTCR binding test set without any seen\npeptide in the training set, our method outperforms the current state-of-the-art\nmethods by more than 10\%. 
The predicted binding scores significantly correlate\nwith the immunotherapy response and clinical outcomes on two clinical cohorts.\nFurthermore, the cross-attention scores and integrated gradients reveal the\namino-acid sites critical for peptide binding to receptors. In essence, our\napproach marks a significant step toward comprehensive evaluation of antigen\nimmunogenicity.\n","authors":["Chenpeng Yu","Xing Fang","Hui Liu"],"pdf_url":"https://arxiv.org/pdf/2405.06653v2.pdf","comment":"Accepted by Nature Machine Intelligence"},{"id":"http://arxiv.org/abs/2501.04794v2","updated":"2025-01-10T14:59:31Z","published":"2025-01-08T19:18:44Z","title":"A Steerable Deep Network for Model-Free Diffusion MRI Registration","summary":" Nonrigid registration is vital to medical image analysis but remains\nchallenging for diffusion MRI (dMRI) due to its high-dimensional,\norientation-dependent nature. While classical methods are accurate, they are\ncomputationally demanding, and deep neural networks, though efficient, have\nbeen underexplored for nonrigid dMRI registration compared to structural\nimaging. We present a novel, deep learning framework for model-free, nonrigid\nregistration of raw diffusion MRI data that does not require explicit\nreorientation. Unlike previous methods relying on derived representations such\nas diffusion tensors or fiber orientation distribution functions, in our\napproach, we formulate the registration as an equivariant diffeomorphism of\nposition-and-orientation space. Central to our method is an\n$\\mathsf{SE}(3)$-equivariant UNet that generates velocity fields while\npreserving the geometric properties of a raw dMRI's domain. We introduce a new\nloss function based on the maximum mean discrepancy in Fourier space,\nimplicitly matching ensemble average propagators across images. 
Experimental\nresults on Human Connectome Project dMRI data demonstrate competitive\nperformance compared to state-of-the-art approaches, with the added advantage\nof bypassing the overhead for estimating derived representations. This work\nestablishes a foundation for data-driven, geometry-aware dMRI registration\ndirectly in the acquisition space.\n","authors":["Gianfranco Cortes","Xiaoda Qu","Baba C. Vemuri"],"pdf_url":"https://arxiv.org/pdf/2501.04794v2.pdf","comment":"Coauthor was inadvertently left out. This is now corrected"},{"id":"http://arxiv.org/abs/2501.06016v1","updated":"2025-01-10T14:53:21Z","published":"2025-01-10T14:53:21Z","title":"Investigating the Impact of Observation Space Design Choices On Training\n Reinforcement Learning Solutions for Spacecraft Problems","summary":" Recent research using Reinforcement Learning (RL) to learn autonomous control\nfor spacecraft operations has shown great success. However, a recent study\nshowed their performance could be improved by changing the action space, i.e.\ncontrol outputs, used in the learning environment. This has opened the door for\nfinding more improvements through further changes to the environment. The work\nin this paper focuses on how changes to the environment's observation space can\nimpact the training and performance of RL agents learning the spacecraft\ninspection task. The studies are split into two groups. The first looks at the\nimpact of sensors that were designed to help agents learn the task. The second\nlooks at the impact of reference frames, reorienting the agent to see the world\nfrom a different perspective. 
The results show the sensors are not necessary,\nbut most of them help agents learn more optimal behavior, and that the\nreference frame does not have a large impact, but is best kept consistent.\n","authors":["Nathaniel Hamilton","Kyle Dunlap","Kerianne L Hobbs"],"pdf_url":"https://arxiv.org/pdf/2501.06016v1.pdf","comment":"18 pages, 10 figures, 3 tables"},{"id":"http://arxiv.org/abs/2412.05545v3","updated":"2025-01-10T14:51:06Z","published":"2024-12-07T05:47:28Z","title":"Convergence analysis of wide shallow neural operators within the\n framework of Neural Tangent Kernel","summary":" Neural operators aim to approximate operators mapping between Banach\nspaces of functions, achieving much success in the field of scientific\ncomputing. Compared to certain deep learning-based solvers, such as\nPhysics-Informed Neural Networks (PINNs) and the Deep Ritz Method (DRM), neural\noperators can solve a class of Partial Differential Equations (PDEs). Although\nmuch work has been done to analyze the approximation and generalization error\nof neural operators, there is still a lack of analysis on their training error.\nIn this work, we conduct the convergence analysis of gradient descent for the\nwide shallow neural operators and physics-informed shallow neural operators\nwithin the framework of Neural Tangent Kernel (NTK). The core idea lies in the\nfact that over-parameterization and random initialization together ensure that\neach weight vector remains near its initialization throughout all iterations,\nyielding the linear convergence of gradient descent. 
In this work, we\ndemonstrate that under the setting of over-parametrization, gradient descent\ncan find the global minimum regardless of whether it is in continuous time or\ndiscrete time.\n","authors":["Xianliang Xu","Ye Li","Zhongyi Huang"],"pdf_url":"https://arxiv.org/pdf/2412.05545v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06007v1","updated":"2025-01-10T14:42:08Z","published":"2025-01-10T14:42:08Z","title":"A Neural Operator for Forecasting Carbon Monoxide Evolution in Cities","summary":" Real-time forecasting of carbon monoxide (CO) concentrations is essential for\nenabling timely interventions to improve urban air quality. Conventional air\nquality models often require extensive computational resources for accurate,\nmulti-scale predictions, limiting their practicality for rapid, real-time\napplication. To address this challenge, we introduce the Complex Neural\nOperator for Air Quality (CoNOAir), a machine learning model that forecasts CO\nconcentrations efficiently. CoNOAir demonstrates superior performance over\nstate-of-the-art models, such as the Fourier Neural Operator (FNO), in both\nshort-term (hourly) and extended (72-hour) forecasts at a national scale. It\nexcels in capturing extreme pollution events and performs consistently across\nmultiple Indian cities, achieving an R2 above 0.95 for hourly CO predictions\nacross all evaluated locations. CoNOAir equips authorities with an effective\ntool for issuing early warnings and designing targeted intervention strategies.\nThis work marks a step forward in achieving dependable, real-time CO pollution\npredictions for densely populated urban centres.\n","authors":["Sanchit Bedi","Karn Tiwari","Prathosh A. P.","Sri Harsha Kota","N. M. 
Anoop Krishnan"],"pdf_url":"https://arxiv.org/pdf/2501.06007v1.pdf","comment":"36 pages, 21 figures, to be published in npj Clean Air journal\n (accepted)"},{"id":"http://arxiv.org/abs/2501.04211v2","updated":"2025-01-10T14:36:48Z","published":"2025-01-08T01:11:17Z","title":"CURing Large Models: Compression via CUR Decomposition","summary":" Large deep learning models have achieved remarkable success but are\nresource-intensive, posing challenges such as memory usage. We introduce\nCURing, a novel model compression method based on CUR matrix decomposition,\nwhich approximates weight matrices as the product of selected columns (C) and\nrows (R), and a small linking matrix (U). We apply this decomposition to\nweights chosen based on the combined influence of their magnitudes and\nactivations. By identifying and retaining informative rows and columns, CURing\nsignificantly reduces model size with minimal performance loss. For example, it\nreduces Llama3.1-8B's parameters to 7.32B (-9%) in just 129 seconds, over 20\ntimes faster than prior compression methods.\n","authors":["Sanghyeon Park","Soo-Mook Moon"],"pdf_url":"https://arxiv.org/pdf/2501.04211v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06003v1","updated":"2025-01-10T14:34:46Z","published":"2025-01-10T14:34:46Z","title":"Learning to generate feasible graphs using graph grammars","summary":" Generative methods for graphs need to be sufficiently flexible to model\ncomplex dependencies between sets of nodes. At the same time, the generated\ngraphs need to satisfy domain-dependent feasibility conditions, that is, they\nshould not violate certain constraints that would make their interpretation\nimpossible within the given application domain (e.g. a molecular graph where an\natom has a very large number of chemical bonds). 
Crucially, constraints can\ninvolve not only local but also long-range dependencies: for example, the\nmaximal length of a cycle can be bounded.\n Currently, a large class of generative approaches for graphs, such as methods\nbased on artificial neural networks, is based on message passing schemes. These\napproaches suffer from information 'dilution' issues that severely limit the\nmaximal range of the dependencies that can be modeled. To address this problem,\nwe propose a generative approach based on the notion of graph grammars. The key\nnovel idea is to introduce a domain-dependent coarsening procedure to provide\nshort-cuts for long-range dependencies.\n We show the effectiveness of our proposal in two domains: 1) small drugs and\n2) RNA secondary structures. In the first case, we compare the quality of the\ngenerated molecular graphs via the Molecular Sets (MOSES) benchmark suite,\nwhich evaluates the distance between generated and real molecules, their\nlipophilicity, synthesizability, and drug-likeness. In the second case, we show\nthat the approach can generate very large graphs (with hundreds of nodes) that\nare accepted as valid examples for a desired RNA family by the \"Infernal\"\ncovariance model, a state-of-the-art RNA classifier.\n Our implementation is available on github:\ngithub.com/fabriziocosta/GraphLearn\n","authors":["Stefan Mautner","Rolf Backofen","Fabrizio Costa"],"pdf_url":"https://arxiv.org/pdf/2501.06003v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06002v1","updated":"2025-01-10T14:34:20Z","published":"2025-01-10T14:34:20Z","title":"DeltaGNN: Graph Neural Network with Information Flow Control","summary":" Graph Neural Networks (GNNs) are popular deep learning models designed to\nprocess graph-structured data through recursive neighborhood aggregations in\nthe message passing process. 
When applied to semi-supervised node\nclassification, the message-passing enables GNNs to understand short-range\nspatial interactions, but also causes them to suffer from over-smoothing and\nover-squashing. These challenges hinder model expressiveness and prevent the\nuse of deeper models to capture long-range node interactions (LRIs) within the\ngraph. Popular solutions for LRI detection are either too expensive to process\nlarge graphs due to high time complexity or fail to generalize across diverse\ngraph structures. To address these limitations, we propose a mechanism called\n\\emph{information flow control}, which leverages a novel connectivity measure,\ncalled \\emph{information flow score}, to address over-smoothing and\nover-squashing with linear computational overhead, supported by theoretical\nevidence. Finally, to prove the efficacy of our methodology, we design DeltaGNN,\nthe first scalable and generalizable approach for detecting long-range and\nshort-range interactions. We benchmark our model across 10 real-world datasets,\nincluding graphs with varying sizes, topologies, densities, and homophilic\nratios, showing superior performance with limited computational complexity. The\nimplementation of the proposed methods is publicly available at\nhttps://github.com/basiralab/DeltaGNN.\n","authors":["Kevin Mancini","Islem Rekik"],"pdf_url":"https://arxiv.org/pdf/2501.06002v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.10859v3","updated":"2025-01-10T14:28:32Z","published":"2024-12-14T15:15:17Z","title":"DUET: Dual Clustering Enhanced Multivariate Time Series Forecasting","summary":" Multivariate time series forecasting is crucial for various applications,\nsuch as financial investment, energy management, weather forecasting, and\ntraffic optimization. However, accurate forecasting is challenging due to two\nmain factors. First, real-world time series often show heterogeneous temporal\npatterns caused by distribution shifts over time. 
Second, correlations among\nchannels are complex and intertwined, making it hard to model the interactions\namong channels precisely and flexibly.\n In this study, we address these challenges by proposing a general framework\ncalled DUET, which introduces dual clustering on the temporal and channel\ndimensions to enhance multivariate time series forecasting. First, we design a\nTemporal Clustering Module (TCM) that clusters time series into fine-grained\ndistributions to handle heterogeneous temporal patterns. For different\ndistribution clusters, we design various pattern extractors to capture their\nintrinsic temporal patterns, thus modeling the heterogeneity. Second, we\nintroduce a novel Channel-Soft-Clustering strategy and design a Channel\nClustering Module (CCM), which captures the relationships among channels in the\nfrequency domain through metric learning and applies sparsification to mitigate\nthe adverse effects of noisy channels. Finally, DUET combines TCM and CCM to\nincorporate both the temporal and channel dimensions. Extensive experiments on\n25 real-world datasets from 10 application domains demonstrate the\nstate-of-the-art performance of DUET.\n","authors":["Xiangfei Qiu","Xingjian Wu","Yan Lin","Chenjuan Guo","Jilin Hu","Bin Yang"],"pdf_url":"https://arxiv.org/pdf/2412.10859v3.pdf","comment":"Accepted by KDD 2025 research track"},{"id":"http://arxiv.org/abs/2501.05991v1","updated":"2025-01-10T14:25:01Z","published":"2025-01-10T14:25:01Z","title":"An Attention-Guided Deep Learning Approach for Classifying 39 Skin\n Lesion Types","summary":" The skin, as the largest organ of the human body, is vulnerable to a diverse\narray of conditions collectively known as skin lesions, which encompass various\ndermatoses. Diagnosing these lesions presents significant challenges for\nmedical practitioners due to the subtle visual differences that are often\nimperceptible to the naked eye. 
While not all skin lesions are\nlife-threatening, certain types can act as early indicators of severe diseases,\nincluding skin cancers, underscoring the critical need for timely and accurate\ndiagnostic methods. Deep learning algorithms have demonstrated remarkable\npotential in facilitating the early detection and prognosis of skin lesions.\nThis study advances the field by curating a comprehensive and diverse dataset\ncomprising 39 categories of skin lesions, synthesized from five publicly\navailable datasets. Using this dataset, the performance of five\nstate-of-the-art deep learning models -- MobileNetV2, Xception, InceptionV3,\nEfficientNetB1, and Vision Transformer -- is rigorously evaluated. To enhance\nthe accuracy and robustness of these models, attention mechanisms such as the\nEfficient Channel Attention (ECA) and the Convolutional Block Attention Module\n(CBAM) are incorporated into their architectures. Comprehensive evaluation\nacross multiple performance metrics reveals that the Vision Transformer model\nintegrated with CBAM outperforms others, achieving an accuracy of 93.46%,\nprecision of 94%, recall of 93%, F1-score of 93%, and specificity of 93.67%.\nThese results underscore the significant potential of the proposed system in\nsupporting medical professionals with accurate and efficient prognostic tools\nfor diagnosing a broad spectrum of skin lesions. 
The dataset and code used in\nthis study can be found at\nhttps://github.com/akabircs/Skin-Lesions-Classification.\n","authors":["Sauda Adiv Hanum","Ashim Dey","Muhammad Ashad Kabir"],"pdf_url":"https://arxiv.org/pdf/2501.05991v1.pdf","comment":"26 pages"},{"id":"http://arxiv.org/abs/2407.17465v3","updated":"2025-01-10T14:22:02Z","published":"2024-07-24T17:58:42Z","title":"u-$μ$P: The Unit-Scaled Maximal Update Parametrization","summary":" The Maximal Update Parametrization ($\\mu$P) aims to make the optimal\nhyperparameters (HPs) of a model independent of its size, allowing them to be\nswept using a cheap proxy model rather than the full-size target model. We\npresent a new scheme, u-$\\mu$P, which improves upon $\\mu$P by combining it with\nUnit Scaling, a method for designing models that makes them easy to train in\nlow-precision. The two techniques have a natural affinity: $\\mu$P ensures that\nthe scale of activations is independent of model size, and Unit Scaling ensures\nthat activations, weights and gradients begin training with a scale of one.\nThis synthesis opens the door to a simpler scheme, whose default values are\nnear-optimal. This in turn facilitates a more efficient sweeping strategy, with\nu-$\\mu$P models reaching a loss that is equal to or lower than comparable\n$\\mu$P models and working out-of-the-box in FP8.\n","authors":["Charlie Blake","Constantin Eichenberg","Josef Dean","Lukas Balles","Luke Y. 
Prince","Björn Deiseroth","Andres Felipe Cruz-Salinas","Carlo Luschi","Samuel Weinbach","Douglas Orr"],"pdf_url":"https://arxiv.org/pdf/2407.17465v3.pdf","comment":"55 pages"},{"id":"http://arxiv.org/abs/2501.05987v1","updated":"2025-01-10T14:18:21Z","published":"2025-01-10T14:18:21Z","title":"Comparing Self-Supervised Learning Models Pre-Trained on Human Speech\n and Animal Vocalizations for Bioacoustics Processing","summary":" Self-supervised learning (SSL) foundation models have emerged as powerful,\ndomain-agnostic, general-purpose feature extractors applicable to a wide range\nof tasks. Such models pre-trained on human speech have demonstrated high\ntransferability for bioacoustic processing. This paper investigates (i) whether\nSSL models pre-trained directly on animal vocalizations offer a significant\nadvantage over those pre-trained on speech, and (ii) whether fine-tuning\nspeech-pretrained models on automatic speech recognition (ASR) tasks can\nenhance bioacoustic classification. We conduct a comparative analysis using\nthree diverse bioacoustic datasets and two different bioacoustic tasks. Results\nindicate that pre-training on bioacoustic data provides only marginal\nimprovements over speech-pretrained models, with comparable performance in most\nscenarios. Fine-tuning on ASR tasks yields mixed outcomes, suggesting that the\ngeneral-purpose representations learned during SSL pre-training are already\nwell-suited for bioacoustic tasks. These findings highlight the robustness of\nspeech-pretrained SSL models for bioacoustics and imply that extensive\nfine-tuning may not be necessary for optimal performance.\n","authors":["Eklavya Sarkar","Mathew Magimai. 
-Doss"],"pdf_url":"https://arxiv.org/pdf/2501.05987v1.pdf","comment":"Accepted at ICASSP 2025"},{"id":"http://arxiv.org/abs/2501.05982v1","updated":"2025-01-10T14:10:19Z","published":"2025-01-10T14:10:19Z","title":"Deep Variational Sequential Monte Carlo for High-Dimensional\n Observations","summary":" Sequential Monte Carlo (SMC), or particle filtering, is widely used in\nnonlinear state-space systems, but its performance often suffers from poorly\napproximated proposal and state-transition distributions. This work introduces\na differentiable particle filter that leverages the unsupervised variational\nSMC objective to parameterize the proposal and transition distributions with a\nneural network, designed to learn from high-dimensional observations.\nExperimental results demonstrate that our approach outperforms established\nbaselines in tracking the challenging Lorenz attractor from high-dimensional\nand partial observations. Furthermore, an evidence lower bound based evaluation\nindicates that our method offers a more accurate representation of the\nposterior distribution.\n","authors":["Wessel L. van Nierop","Nir Shlezinger","Ruud J. G. van Sloun"],"pdf_url":"https://arxiv.org/pdf/2501.05982v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05970v1","updated":"2025-01-10T13:56:03Z","published":"2025-01-10T13:56:03Z","title":"A Brain Age Residual Biomarker (BARB): Leveraging MRI-Based Models to\n Detect Latent Health Conditions in U.S. Veterans","summary":" Age prediction using brain imaging, such as MRIs, has achieved promising\nresults, with several studies identifying the model's residual as a potential\nbiomarker for chronic disease states. In this study, we developed a brain age\npredictive model using a dataset of 1,220 U.S. veterans (18--80 years) and\nconvolutional neural networks (CNNs) trained on two-dimensional slices of axial\nT2-weighted fast spin-echo and T2-weighted fluid attenuated inversion recovery\nMRI images. 
The model, incorporating a degree-3 polynomial ensemble, achieved\nan $R^{2}$ of 0.816 on the testing set. Images were acquired at the level of\nthe anterior commissure and the frontal horns of the lateral ventricles.\nResidual analysis was performed to assess its potential as a biomarker for five\nICD-coded conditions: hypertension (HTN), diabetes mellitus (DM), mild\ntraumatic brain injury (mTBI), illicit substance abuse/dependence (SAD), and\nalcohol abuse/dependence (AAD). Residuals grouped by the number of ICD-coded\nconditions demonstrated different trends that were statistically significant\n($p = 0.002$), suggesting a relationship between disease states and predicted\nbrain age. This association was particularly pronounced in patients over 49\nyears, where negative residuals (indicating advanced brain aging) correlated\nwith the presence of multiple ICD codes. These findings support the potential\nof residuals as biomarkers for detecting latent health conditions.\n","authors":["Arthur Bousquet","Sugata Banerji","Mark F. Conneely","Shahrzad Jamshidi"],"pdf_url":"https://arxiv.org/pdf/2501.05970v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14488v2","updated":"2025-01-10T13:52:14Z","published":"2024-12-19T03:22:47Z","title":"A stochastic first-order method with multi-extrapolated momentum for\n highly smooth unconstrained optimization","summary":" In this paper, we consider an unconstrained stochastic optimization problem\nwhere the objective function exhibits high-order smoothness. Specifically, we\npropose a new stochastic first-order method (SFOM) with multi-extrapolated\nmomentum, in which multiple extrapolations are performed in each iteration,\nfollowed by a momentum update based on these extrapolations. We demonstrate\nthat the proposed SFOM can accelerate optimization by exploiting the high-order\nsmoothness of the objective function $f$. 
Assuming that the $p$th-order\nderivative of $f$ is Lipschitz continuous for some $p\\ge2$, and under\nadditional mild assumptions, we establish that our method achieves a sample\ncomplexity of $\\widetilde{\\mathcal{O}}(\\epsilon^{-(3p+1)/p})$ for finding a\npoint $x$ such that $\\mathbb{E}[\\|\\nabla f(x)\\|]\\le\\epsilon$. To the best of\nour knowledge, this is the first SFOM to leverage arbitrary-order smoothness of\nthe objective function for acceleration, resulting in a sample complexity that\nimproves upon the best-known results without assuming the mean-squared\nsmoothness condition. Preliminary numerical experiments validate the practical\nperformance of our method and support our theoretical findings.\n","authors":["Chuan He"],"pdf_url":"https://arxiv.org/pdf/2412.14488v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05966v1","updated":"2025-01-10T13:49:09Z","published":"2025-01-10T13:49:09Z","title":"Towards Early Prediction of Self-Supervised Speech Model Performance","summary":" In Self-Supervised Learning (SSL), pre-training and evaluation are resource\nintensive. In the speech domain, current indicators of the quality of SSL\nmodels during pre-training, such as the loss, do not correlate well with\ndownstream performance. Consequently, it is often difficult to gauge the final\ndownstream performance in a cost efficient manner during pre-training. In this\nwork, we propose unsupervised efficient methods that give insights into the\nquality of the pre-training of SSL speech models, namely, measuring the cluster\nquality and rank of the embeddings of the SSL model. 
Results show that measures\nof cluster quality and rank correlate better with downstream performance than\nthe pre-training loss with only one hour of unlabeled audio, reducing the need\nfor GPU hours and labeled data in SSL model evaluation.\n","authors":["Ryan Whetten","Lucas Maison","Titouan Parcollet","Marco Dinarelli","Yannick Estève"],"pdf_url":"https://arxiv.org/pdf/2501.05966v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05965v1","updated":"2025-01-10T13:47:13Z","published":"2025-01-10T13:47:13Z","title":"Model Inversion in Split Learning for Personalized LLMs: New Insights\n from Information Bottleneck Theory","summary":" Personalized Large Language Models (LLMs) have become increasingly prevalent,\nshowcasing the impressive capabilities of models like GPT-4. This trend has\nalso catalyzed extensive research on deploying LLMs on mobile devices. Feasible\napproaches for such edge-cloud deployment include using split learning.\nHowever, previous research has largely overlooked the privacy leakage\nassociated with intermediate representations transmitted from devices to\nservers. This work is the first to identify model inversion attacks in the\nsplit learning framework for LLMs, emphasizing the necessity of secure defense.\nFor the first time, we introduce mutual information entropy to understand the\ninformation propagation of Transformer-based LLMs and assess privacy attack\nperformance for LLM blocks. To address the issue of representations being\nsparser and containing less information than embeddings, we propose a two-stage\nattack system in which the first part projects representations into the\nembedding space, and the second part uses a generative model to recover text\nfrom these embeddings. This design breaks down the complexity and achieves\nattack scores of 38%-75% in various scenarios, with an over 60% improvement\nover the SOTA. 
This work comprehensively highlights the potential privacy risks\nduring the deployment of personalized LLMs on the edge side.\n","authors":["Yunmeng Shu","Shaofeng Li","Tian Dong","Yan Meng","Haojin Zhu"],"pdf_url":"https://arxiv.org/pdf/2501.05965v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2302.10798v5","updated":"2025-01-10T13:43:48Z","published":"2023-02-17T09:37:17Z","title":"Learning a Consensus Sub-Network with Polarization Regularization and\n One Pass Training","summary":" The subject of green AI has been gaining attention within the deep learning\ncommunity given the recent trend of ever larger and more complex neural network\nmodels. Existing solutions for reducing the computational load of training at\ninference time usually involve pruning the network parameters. Pruning schemes\noften create extra overhead either by iterative training and fine-tuning for\nstatic pruning or repeated computation of a dynamic pruning graph. We propose a\nnew parameter pruning strategy for learning a lighter-weight sub-network that\nminimizes the energy cost while maintaining comparable performance to the fully\nparameterised network on given downstream tasks. Our proposed pruning scheme is\ngreen-oriented, as it only requires a one-off training to discover the optimal\nstatic sub-networks by dynamic pruning methods. The pruning scheme consists of\na binary gating module and a polarizing loss function to uncover sub-networks\nwith user-defined sparsity. Our method enables pruning and training\nsimultaneously, which saves energy in both the training and inference phases\nand avoids extra computational overhead from gating modules at inference time.\nOur results on CIFAR-10, CIFAR-100, and Tiny Imagenet suggest that our scheme\ncan remove 50% of connections in deep networks with <1% reduction in\nclassification accuracy. 
Compared to other related pruning methods, our method\ndemonstrates a lower drop in accuracy for equivalent reductions in\ncomputational cost.\n","authors":["Xiaoying Zhi","Varun Babbar","Rundong Liu","Pheobe Sun","Fran Silavong","Ruibo Shi","Sean Moran"],"pdf_url":"https://arxiv.org/pdf/2302.10798v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05942v1","updated":"2025-01-10T13:06:36Z","published":"2025-01-10T13:06:36Z","title":"Soft regression trees: a model variant and a decomposition training\n algorithm","summary":" Decision trees are widely used for classification and regression tasks in a\nvariety of application fields due to their interpretability and good accuracy.\nDuring the past decade, growing attention has been devoted to globally\noptimized decision trees with deterministic or soft splitting rules at branch\nnodes, which are trained by optimizing the error function over all the tree\nparameters. In this work, we propose a new variant of soft multivariate\nregression trees (SRTs) where, for every input vector, the prediction is\ndefined as the linear regression associated with a single leaf node, namely, the\nleaf node obtained by routing the input vector from the root along the branches\nwith higher probability. SRTs exhibit the conditional computational property,\ni.e., each prediction depends on a small number of nodes (parameters), and our\nnonlinear optimization formulation for training them is amenable to\ndecomposition. After showing a universal approximation result for SRTs, we\npresent a decomposition training algorithm including a clustering-based\ninitialization procedure and a heuristic for reassigning the input vectors\nalong the tree. Under mild assumptions, we establish asymptotic convergence\nguarantees. 
Experiments on 15 well-known datasets indicate that our SRTs and\ndecomposition algorithm yield higher accuracy and robustness compared with\ntraditional soft regression trees trained using the nonlinear optimization\nformulation of Blanquero et al., and a significant reduction in training times\nas well as a slightly better average accuracy compared with the mixed-integer\noptimization approach of Bertsimas and Dunn. We also report a comparison with\nthe Random Forest ensemble method.\n","authors":["Antonio Consolo","Edoardo Amaldi","Andrea Manno"],"pdf_url":"https://arxiv.org/pdf/2501.05942v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05934v1","updated":"2025-01-10T12:56:19Z","published":"2025-01-10T12:56:19Z","title":"Encoded Spatial Attribute in Multi-Tier Federated Learning","summary":" This research presents an Encoded Spatial Multi-Tier Federated Learning\napproach for a comprehensive evaluation of aggregated models for geospatial\ndata. In the client tier, encoding spatial information is introduced to better\npredict the target outcome. The research aims to assess the performance of\nthese models across diverse datasets and spatial attributes, highlighting\nvariations in predictive accuracy. Using evaluation metrics such as accuracy,\nour research reveals insights into the complexities of spatial granularity and\nthe challenges of capturing underlying patterns in the data. We extended the\nscope of federated learning (FL) by adding multiple tiers along with the\nfunctionality of encoding spatial attributes. Our N-tier FL approach used\nencoded spatial data to aggregate in different tiers. We obtained multiple\nmodels that predicted the different granularities of spatial data. Our findings\nunderscore the need for further research to improve predictive accuracy and\nmodel generalization, with potential avenues including incorporating additional\nfeatures, refining model architectures, and exploring alternative modeling\napproaches. 
Our experiments have several tiers representing different levels of\nspatial aspects. We obtained accuracies of 75.62% and 89.52% for the global model\nwithout having to train the model using the data constituted with the\ndesignated tier. The research also highlights the importance of the proposed\napproach in real-time applications.\n","authors":["Asfia Kawnine","Francis Palma","Seyed Alireza Rahimi Azghadi","Hung Cao"],"pdf_url":"https://arxiv.org/pdf/2501.05934v1.pdf","comment":"IEEE ICCE 2025"},{"id":"http://arxiv.org/abs/2501.05932v1","updated":"2025-01-10T12:55:34Z","published":"2025-01-10T12:55:34Z","title":"DiffuSETS: 12-lead ECG Generation Conditioned on Clinical Text Reports\n and Patient-Specific Information","summary":" Heart disease remains a significant threat to human health. As a non-invasive\ndiagnostic tool, the electrocardiogram (ECG) is one of the most widely used\nmethods for cardiac screening. However, the scarcity of high-quality ECG data,\ndriven by privacy concerns and limited medical resources, creates a pressing\nneed for effective ECG signal generation. Existing approaches for generating\nECG signals typically rely on small training datasets, lack comprehensive\nevaluation frameworks, and overlook potential applications beyond data\naugmentation. To address these challenges, we propose DiffuSETS, a novel\nframework capable of generating ECG signals with high semantic alignment and\nfidelity. DiffuSETS accepts various modalities of clinical text reports and\npatient-specific information as inputs, enabling the creation of clinically\nmeaningful ECG signals. Additionally, to address the lack of standardized\nevaluation in ECG generation, we introduce a comprehensive benchmarking\nmethodology to assess the effectiveness of generative models in this domain.\nOur model achieves excellent results in tests, proving its superiority in the\ntask of ECG generation. 
Furthermore, we showcase its potential to mitigate data\nscarcity while exploring novel applications in cardiology education and medical\nknowledge discovery, highlighting the broader impact of our work.\n","authors":["Yongfan Lai","Jiabo Chen","Deyun Zhang","Yue Wang","Shijia Geng","Hongyan Li","Shenda Hong"],"pdf_url":"https://arxiv.org/pdf/2501.05932v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.07128v2","updated":"2025-01-10T12:34:47Z","published":"2024-09-23T11:29:19Z","title":"Neural Differential Appearance Equations","summary":" We propose a method to reproduce dynamic appearance textures with\nspace-stationary but time-varying visual statistics. While most previous work\ndecomposes dynamic textures into static appearance and motion, we focus on\ndynamic appearance that results not from motion but variations of fundamental\nproperties, such as rusting, decaying, melting, and weathering. To this end, we\nadopt the neural ordinary differential equation (ODE) to learn the underlying\ndynamics of appearance from a target exemplar. We simulate the ODE in two\nphases. At the \"warm-up\" phase, the ODE diffuses a random noise to an initial\nstate. We then constrain the further evolution of this ODE to replicate the\nevolution of visual feature statistics in the exemplar during the generation\nphase. The particular innovation of this work is the neural ODE achieving both\ndenoising and evolution for dynamics synthesis, with a proposed temporal\ntraining scheme. We study both relightable (BRDF) and non-relightable (RGB)\nappearance models. For both we introduce new pilot datasets, allowing, for the\nfirst time, to study such phenomena: For RGB we provide 22 dynamic textures\nacquired from free online sources; For BRDFs, we further acquire a dataset of\n21 flash-lit videos of time-varying materials, enabled by a simple-to-construct\nsetup. 
Our experiments show that our method consistently yields realistic and\ncoherent results, whereas prior works falter under pronounced temporal\nappearance variations. A user study confirms our approach is preferred to\nprevious work for such exemplars.\n","authors":["Chen Liu","Tobias Ritschel"],"pdf_url":"https://arxiv.org/pdf/2410.07128v2.pdf","comment":"SIGGRAPH Asia 2024 Journal Track. Project page at\n https://ryushinn.github.io/ode-appearance"},{"id":"http://arxiv.org/abs/2411.07432v2","updated":"2025-01-10T12:33:03Z","published":"2024-11-11T23:21:01Z","title":"Fast unsupervised ground metric learning with tree-Wasserstein distance","summary":" The performance of unsupervised methods such as clustering depends on the\nchoice of distance metric between features, or ground metric. Commonly, ground\nmetrics are decided with heuristics or learned via supervised algorithms.\nHowever, since many interesting datasets are unlabelled, unsupervised ground\nmetric learning approaches have been introduced. One promising option employs\nWasserstein singular vectors (WSVs), which emerge when computing optimal\ntransport distances between features and samples simultaneously. WSVs are\neffective, but can be prohibitively computationally expensive in some\napplications: $\\mathcal{O}(n^2m^2(n \\log(n) + m \\log(m)))$ for $n$ samples and\n$m$ features. In this work, we propose to augment the WSV method by embedding\nsamples and features on trees, on which we compute the tree-Wasserstein\ndistance (TWD). We demonstrate theoretically and empirically that the algorithm\nconverges to a better approximation of the standard WSV approach than the best\nknown alternatives, and does so with $\\mathcal{O}(n^3+m^3+mn)$ complexity. In\naddition, we prove that the initial tree structure can be chosen flexibly,\nsince tree geometry does not constrain the richness of the approximation up to\nthe number of edge weights. 
This proof suggests a fast and recursive algorithm\nfor computing the tree parameter basis set, which we find crucial to realising\nthe efficiency gains at scale. Finally, we apply the tree-WSV algorithm to\nseveral single-cell RNA sequencing genomics datasets, demonstrating its\nscalability and utility for unsupervised cell-type clustering problems. These\nresults poise unsupervised ground metric learning with TWD as a low-rank\napproximation of WSV with the potential for widespread application.\n","authors":["Kira M. Düsterwald","Samo Hromadka","Makoto Yamada"],"pdf_url":"https://arxiv.org/pdf/2411.07432v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05906v1","updated":"2025-01-10T12:07:00Z","published":"2025-01-10T12:07:00Z","title":"Q-MAML: Quantum Model-Agnostic Meta-Learning for Variational Quantum\n Algorithms","summary":" In the Noisy Intermediate-Scale Quantum (NISQ) era, using variational quantum\nalgorithms (VQAs) to solve optimization problems has become a key application.\nHowever, these algorithms face significant challenges, such as choosing an\neffective initial set of parameters and the limited quantum processing time\nthat restricts the number of optimization iterations. In this study, we\nintroduce a new framework for optimizing parameterized quantum circuits (PQCs)\nthat employs a classical optimizer, inspired by the Model-Agnostic Meta-Learning\n(MAML) technique. This approach aims to achieve better parameter initialization\nthat ensures fast convergence. Our framework features a classical neural\nnetwork, called Learner, which interacts with a PQC using the output of\nLearner as an initial parameter. During the pre-training phase, Learner is\ntrained with a meta-objective based on the quantum circuit cost function. In\nthe adaptation phase, the framework requires only a few PQC updates to converge\nto a more accurate value, while the learner remains unchanged. 
This method is\nhighly adaptable and is effectively extended to various Hamiltonian\noptimization problems. We validate our approach through experiments, including\ndistribution function mapping and optimization of the Heisenberg XYZ\nHamiltonian. The result implies that the Learner successfully estimates initial\nparameters that generalize across the problem space, enabling fast adaptation.\n","authors":["Junyong Lee","JeiHee Cho","Shiho Kim"],"pdf_url":"https://arxiv.org/pdf/2501.05906v1.pdf","comment":"8 pages, 8 figures, to be published in AAAI 25"},{"id":"http://arxiv.org/abs/2501.00574v2","updated":"2025-01-10T12:00:51Z","published":"2024-12-31T18:01:23Z","title":"VideoChat-Flash: Hierarchical Compression for Long-Context Video\n Modeling","summary":" Long-context modeling is a critical capability for multimodal large language\nmodels (MLLMs), enabling them to process long-form contents with implicit\nmemorization. Despite its advances, handling extremely long videos remains\nchallenging due to the difficulty in maintaining crucial features over extended\nsequences. This paper introduces a Hierarchical visual token Compression (HiCo)\nmethod designed for high-fidelity representation and a practical context\nmodeling system VideoChat-Flash tailored for multimodal long-sequence\nprocessing. HiCo capitalizes on the redundancy of visual information in long\nvideos to compress long video context from the clip-level to the video-level,\nreducing the compute significantly while preserving essential details.\nVideoChat-Flash features a multi-stage short-to-long learning scheme, a rich\ndataset of real-world long videos named LongVid, and an upgraded\n\"Needle-In-A-video-Haystack\" (NIAH) for evaluating context capacities. In\nextensive experiments, VideoChat-Flash shows the leading performance on both\nmainstream long and short video benchmarks at the 2B and 7B model scale. 
It is\nthe first among open-source models to reach 99.1% accuracy over 10,000 frames\nin NIAH.\n","authors":["Xinhao Li","Yi Wang","Jiashuo Yu","Xiangyu Zeng","Yuhan Zhu","Haian Huang","Jianfei Gao","Kunchang Li","Yinan He","Chenting Wang","Yu Qiao","Yali Wang","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2501.00574v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05903v1","updated":"2025-01-10T12:00:08Z","published":"2025-01-10T12:00:08Z","title":"Discovery of sustainable energy materials via the machine-learned\n material space","summary":" Does a machine learning model actually gain an understanding of the material\nspace? We answer this question in the affirmative on the example of the\nOptiMate model, a graph attention network trained to predict the optical\nproperties of semiconductors and insulators. By applying the UMAP\ndimensionality reduction technique to its latent embeddings, we demonstrate\nthat the model captures a nuanced and interpretable representation of the\nmaterials space, reflecting chemical and physical principles, without any\nuser-induced bias. This enables clustering of almost 10,000 materials based on\noptical properties and chemical similarities. Beyond this understanding, we\ndemonstrate how the learned material space can be used to identify more\nsustainable alternatives to critical materials in energy-related technologies,\nsuch as photovoltaics. These findings demonstrate the dual utility of machine\nlearning models in materials science: Accurately predicting material properties\nwhile providing insights into the underlying materials space. 
The approach\ndemonstrates the broader potential of leveraging learned materials spaces for\nthe discovery and design of materials for diverse applications, and is easily\napplicable to any state-of-the-art machine learning model.\n","authors":["Malte Grunert","Max Großmann","Erich Runge"],"pdf_url":"https://arxiv.org/pdf/2501.05903v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05130v2","updated":"2025-01-10T11:50:00Z","published":"2025-01-09T10:33:16Z","title":"Learning In-Distribution Representations for Anomaly Detection","summary":" Anomaly detection involves identifying data patterns that deviate from the\nanticipated norm. Traditional methods struggle in high-dimensional spaces due\nto the curse of dimensionality. In recent years, self-supervised learning,\nparticularly through contrastive objectives, has driven advances in anomaly\ndetection. However, vanilla contrastive learning struggles to align with the\nunique demands of anomaly detection, as it lacks a pretext task tailored to the\nhomogeneous nature of In-Distribution (ID) data and the diversity of\nOut-of-Distribution (OOD) anomalies. Methods that attempt to address these\nchallenges, such as introducing hard negatives through synthetic outliers,\nOutlier Exposure (OE), and supervised objectives, often rely on pretext tasks\nthat fail to balance compact clustering of ID samples with sufficient\nseparation from OOD data. In this work, we propose Focused In-distribution\nRepresentation Modeling (FIRM), a contrastive learning objective specifically\ndesigned for anomaly detection. Unlike existing approaches, FIRM incorporates\nsynthetic outliers into its pretext task in a way that actively shapes the\nrepresentation space, promoting compact clustering of ID samples while\nenforcing strong separation from outliers. This formulation addresses the\nchallenges of class collision, enhancing both the compactness of ID\nrepresentations and the discriminative power of the learned feature space. 
We\nshow that FIRM surpasses other contrastive methods in standard benchmarks,\nsignificantly enhancing anomaly detection compared to both traditional and\nsupervised contrastive learning objectives. Our ablation studies confirm that\nFIRM consistently improves the quality of representations and shows robustness\nacross a range of scoring methods. The code is available at:\nhttps://github.com/willtl/firm.\n","authors":["Willian T. Lunardi","Abdulrahman Banabila","Dania Herzalla","Martin Andreoni"],"pdf_url":"https://arxiv.org/pdf/2501.05130v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05894v1","updated":"2025-01-10T11:46:51Z","published":"2025-01-10T11:46:51Z","title":"Text2Playlist: Generating Personalized Playlists from Text on Deezer","summary":" The streaming service Deezer heavily relies on the search to help users\nnavigate through its extensive music catalog. Nonetheless, it is primarily\ndesigned to find specific items and does not lead directly to a smooth\nlistening experience. We present Text2Playlist, a stand-alone tool that\naddresses these limitations. Text2Playlist leverages generative AI, music\ninformation retrieval and recommendation systems to generate query-specific and\npersonalized playlists, successfully deployed at scale.\n","authors":["Mathieu Delcluze","Antoine Khoury","Clémence Vast","Valerio Arnaudo","Léa Briand","Walid Bendada","Thomas Bouabça"],"pdf_url":"https://arxiv.org/pdf/2501.05894v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05885v1","updated":"2025-01-10T11:37:50Z","published":"2025-01-10T11:37:50Z","title":"EDNet: Edge-Optimized Small Target Detection in UAV Imagery -- Faster\n Context Attention, Better Feature Fusion, and Hardware Acceleration","summary":" Detecting small targets in drone imagery is challenging due to low\nresolution, complex backgrounds, and dynamic scenes. 
We propose EDNet, a novel\nedge-target detection framework built on an enhanced YOLOv10 architecture,\noptimized for real-time applications without post-processing. EDNet\nincorporates an XSmall detection head and a Cross Concat strategy to improve\nfeature fusion and multi-scale context awareness for detecting tiny targets in\ndiverse environments. Our unique C2f-FCA block employs Faster Context Attention\nto enhance feature extraction while reducing computational complexity. The WIoU\nloss function is employed for improved bounding box regression. With seven\nmodel sizes ranging from Tiny to XL, EDNet accommodates various deployment\nenvironments, enabling local real-time inference and ensuring data privacy.\nNotably, EDNet achieves up to a 5.6% gain in mAP@50 with significantly fewer\nparameters. On an iPhone 12, EDNet variants operate at speeds ranging from 16\nto 55 FPS, providing a scalable and efficient solution for edge-based object\ndetection in challenging drone imagery. The source code and pre-trained models\nare available at: https://github.com/zsniko/EDNet.\n","authors":["Zhifan Song","Yuan Zhang","Abd Al Rahman M. Abu Ebayyeh"],"pdf_url":"https://arxiv.org/pdf/2501.05885v1.pdf","comment":"Accepted in 21st IEEE International Conference on Ubiquitous\n Intelligence and Computing (UIC 2024)\n https://www.ieee-smart-world.org/2024/uic"},{"id":"http://arxiv.org/abs/2501.01987v2","updated":"2025-01-10T11:36:09Z","published":"2024-12-30T18:08:13Z","title":"Gender Bias in Text-to-Video Generation Models: A case study of Sora","summary":" The advent of text-to-video generation models has revolutionized content\ncreation as it produces high-quality videos from textual prompts. However,\nconcerns regarding inherent biases in such models have prompted scrutiny,\nparticularly regarding gender representation. Our study investigates the\npresence of gender bias in OpenAI's Sora, a state-of-the-art text-to-video\ngeneration model. 
We uncover significant evidence of bias by analyzing the\ngenerated videos from a diverse set of gender-neutral and stereotypical\nprompts. The results indicate that Sora disproportionately associates specific\ngenders with stereotypical behaviors and professions, which reflects societal\nprejudices embedded in its training data.\n","authors":["Mohammad Nadeem","Shahab Saquib Sohail","Erik Cambria","Björn W. Schuller","Amir Hussain"],"pdf_url":"https://arxiv.org/pdf/2501.01987v2.pdf","comment":"7 pages, 3 figures"},{"id":"http://arxiv.org/abs/2501.05874v1","updated":"2025-01-10T11:17:15Z","published":"2025-01-10T11:17:15Z","title":"VideoRAG: Retrieval-Augmented Generation over Video Corpus","summary":" Retrieval-Augmented Generation (RAG) is a powerful strategy to address the\nissue of generating factually incorrect outputs in foundation models by\nretrieving external knowledge relevant to queries and incorporating it into\ntheir generation process. However, existing RAG approaches have primarily\nfocused on textual information, with some recent advancements beginning to\nconsider images, and they largely overlook videos, a rich source of multimodal\nknowledge capable of representing events, processes, and contextual details\nmore effectively than any other modality. While a few recent studies explore\nthe integration of videos in the response generation process, they either\npredefine query-associated videos without retrieving them according to queries,\nor convert videos into the textual descriptions without harnessing their\nmultimodal richness. To tackle these, we introduce VideoRAG, a novel framework\nthat not only dynamically retrieves relevant videos based on their relevance\nwith queries but also utilizes both visual and textual information of videos in\nthe output generation. 
Further, to operationalize this, our method revolves\naround the recent advance of Large Video Language Models (LVLMs), which enable\nthe direct processing of video content to represent it for retrieval and\nseamless integration of the retrieved videos jointly with queries. We\nexperimentally validate the effectiveness of VideoRAG, showcasing that it is\nsuperior to relevant baselines.\n","authors":["Soyeong Jeong","Kangsan Kim","Jinheon Baek","Sung Ju Hwang"],"pdf_url":"https://arxiv.org/pdf/2501.05874v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05871v1","updated":"2025-01-10T11:12:03Z","published":"2025-01-10T11:12:03Z","title":"Collaborative Content Moderation in the Fediverse","summary":" The Fediverse, a group of interconnected servers providing a variety of\ninteroperable services (e.g. micro-blogging in Mastodon) has gained rapid\npopularity. This sudden growth, partly driven by Elon Musk's acquisition of\nTwitter, has created challenges for administrators though. This paper focuses\non one particular challenge: content moderation, e.g. the need to remove spam\nor hate speech. While centralized platforms like Facebook and Twitter rely on\nautomated tools for moderation, their dependence on massive labeled datasets\nand specialized infrastructure renders them impractical for decentralized,\nlow-resource settings like the Fediverse. In this work, we design and evaluate\nFedMod, a collaborative content moderation system based on federated learning.\nOur system enables servers to exchange parameters of partially trained local\ncontent moderation models with similar servers, creating a federated model\nshared among collaborating servers. 
FedMod demonstrates robust performance on\nthree different content moderation tasks: harmful content detection, bot\ncontent detection, and content warning assignment, achieving average per-server\nmacro-F1 scores of 0.71, 0.73, and 0.58, respectively.\n","authors":["Haris Bin Zia","Aravindh Raman","Ignacio Castro","Gareth Tyson"],"pdf_url":"https://arxiv.org/pdf/2501.05871v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05870v1","updated":"2025-01-10T11:11:08Z","published":"2025-01-10T11:11:08Z","title":"A Neighbor-based Approach to Pitch Ownership Models in Soccer","summary":" Pitch ownership models allow many types of analysis in soccer and provide\nvaluable assistance to tactical analysts in understanding the game's dynamics.\nThe novelty they provide over event-based analysis is that tracking data\nincorporates context that event-based data does not possess, like player\npositioning. This paper proposes a novel approach to building pitch ownership\nmodels in soccer games using the K-Nearest Neighbors (KNN) algorithm. Our\napproach provides a fast inference mechanism that can model different\napproaches to pitch control using the same algorithm. Despite its flexibility,\nit uses only three hyperparameters to tune the model, facilitating the tuning\nprocess for different player skill levels. The flexibility of the approach\nallows for the emulation of different methods available in the literature by\nadjusting a small number of parameters, including adjusting for different\nlevels of uncertainty. In summary, the proposed model provides a new and more\nflexible strategy for building pitch ownership models, extending beyond just\nreplicating existing algorithms, and can provide valuable insights for tactical\nanalysts and open up new avenues for future research. 
We thoroughly visualize\nseveral examples demonstrating the presented models' strengths and weaknesses.\nThe code is available at github.com/nvsclub/KNNPitchControl.\n","authors":["Tiago Mendes-Neves","Luís Meireles","João Mendes-Moreira"],"pdf_url":"https://arxiv.org/pdf/2501.05870v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05867v1","updated":"2025-01-10T11:08:40Z","published":"2025-01-10T11:08:40Z","title":"Neural Network Verification is a Programming Language Challenge","summary":" Neural network verification is a new and rapidly developing field of\nresearch. So far, the main priority has been establishing efficient\nverification algorithms and tools, while proper support from the programming\nlanguage perspective has been considered secondary or unimportant. Yet, there\nis mounting evidence that insights from the programming language community may\nmake a difference in the future development of this domain. In this paper, we\nformulate neural network verification challenges as programming language\nchallenges and suggest possible future solutions.\n","authors":["Lucas C. Cordeiro","Matthew L. Daggitt","Julien Girard-Satabin","Omri Isac","Taylor T. Johnson","Guy Katz","Ekaterina Komendantskaya","Augustin Lemesle","Edoardo Manino","Artjoms Šinkarovs","Haoze Wu"],"pdf_url":"https://arxiv.org/pdf/2501.05867v1.pdf","comment":"Accepted at ESOP 2025, European Symposium on Programming Languages"},{"id":"http://arxiv.org/abs/2501.05852v1","updated":"2025-01-10T10:47:00Z","published":"2025-01-10T10:47:00Z","title":"MRI Patterns of the Hippocampus and Amygdala for Predicting Stages of\n Alzheimer's Progression: A Minimal Feature Machine Learning Framework","summary":" Alzheimer's disease (AD) progresses through distinct stages, from early mild\ncognitive impairment (EMCI) to late mild cognitive impairment (LMCI) and\neventually to AD. 
Accurate identification of these stages, especially\ndistinguishing LMCI from EMCI, is crucial for developing pre-dementia\ntreatments but remains challenging due to subtle and overlapping imaging\nfeatures. This study proposes a minimal-feature machine learning framework that\nleverages structural MRI data, focusing on the hippocampus and amygdala as\nregions of interest. The framework addresses the curse of dimensionality\nthrough feature selection, utilizes region-specific voxel information, and\nimplements innovative data organization to enhance classification performance\nby reducing noise. The methodology integrates dimensionality reduction\ntechniques such as PCA and t-SNE with state-of-the-art classifiers, achieving\nthe highest accuracy of 88.46%. This framework demonstrates the potential for\nefficient and accurate staging of AD progression while providing valuable\ninsights for clinical applications.\n","authors":["Aswini Kumar Patra","Soraisham Elizabeth Devi","Tejashwini Gajurel"],"pdf_url":"https://arxiv.org/pdf/2501.05852v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05845v1","updated":"2025-01-10T10:36:46Z","published":"2025-01-10T10:36:46Z","title":"Annealing Machine-assisted Learning of Graph Neural Network for\n Combinatorial Optimization","summary":" While Annealing Machines (AM) have shown increasing capabilities in solving\ncomplex combinatorial problems, positioning themselves as a more immediate\nalternative to the expected advances of future fully quantum solutions, there\nare still scaling limitations. In parallel, Graph Neural Networks (GNN) have\nbeen recently adapted to solve combinatorial problems, showing competitive\nresults and potentially high scalability due to their distributed nature. We\npropose a merging approach that aims at retaining both the accuracy exhibited\nby AMs and the representational flexibility and scalability of GNNs. 
Our model\nconsiders a compression step, followed by a supervised interaction where\npartial solutions obtained from the AM are used to guide local GNNs, from which\nnode feature representations are obtained and combined to initialize an\nadditional GNN-based solver that handles the original graph's target problem.\nIntuitively, the AM can solve the combinatorial problem indirectly by infusing\nits knowledge into the GNN. Experiments on canonical optimization problems show\nthat the idea is feasible, effectively allowing the AM to solve problems of\nsizes beyond its original limits.\n","authors":["Pablo Loyola","Kento Hasegawa","Andres Hoyos-Idobro","Kazuo Ono","Toyotaro Suzumura","Yu Hirate","Masanao Yamaoka"],"pdf_url":"https://arxiv.org/pdf/2501.05845v1.pdf","comment":"Second Workshop on Machine Learning with New Compute Paradigms at\n NeurIPS 2024 (MLNCP 2024)"},{"id":"http://arxiv.org/abs/2501.05844v1","updated":"2025-01-10T10:36:26Z","published":"2025-01-10T10:36:26Z","title":"\"Cause\" is Mechanistic Narrative within Scientific Domains: An Ordinary\n Language Philosophical Critique of \"Causal Machine Learning\"","summary":" Causal Learning has emerged as a major theme of AI in recent years, promising\nto use special techniques to reveal the true nature of cause and effect in a\nnumber of important domains. We consider the Epistemology of learning and\nrecognizing true cause and effect phenomena. Through thought exercises on the\ncustomary use of the word ''cause'', especially in scientific domains, we\ninvestigate what, in practice, constitutes a valid causal claim. We recognize\nthe word's uses across scientific domains in disparate form but consistent\nfunction within the scientific paradigm. 
We highlight fundamental distinctions\nof practice that can be performed in the natural and social sciences, highlight\nthe importance of many systems of interest being open and irreducible and\nidentify the important notion of Hermeneutic knowledge for social science\ninquiry. We posit that the distinct properties require that definitive causal\nclaims can only come through an agglomeration of consistent evidence across\nmultiple domains and levels of abstraction, such as empirical, physiological,\nbiochemical, etc. We present Cognitive Science as an exemplary\nmulti-disciplinary field providing omnipresent opportunity for such a Research\nProgram, and highlight the main general modes of practice of scientific inquiry\nthat can adequately merge, rather than place as incorrigibly conflictual,\nmulti-domain multi-abstraction scientific practices and language games.\n","authors":["Vyacheslav Kungurtsev","Leonardo Christov Moore","Gustav Sir","Martin Krutsky"],"pdf_url":"https://arxiv.org/pdf/2501.05844v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05842v1","updated":"2025-01-10T10:33:13Z","published":"2025-01-10T10:33:13Z","title":"Orthogonal projection-based regularization for efficient model\n augmentation","summary":" Deep-learning-based nonlinear system identification has shown the ability to\nproduce reliable and highly accurate models in practice. However, these\nblack-box models lack physical interpretability, and often a considerable part\nof the learning effort is spent on capturing already expected/known behavior\ndue to first-principles-based understanding of some aspects of the system. A\npotential solution is to integrate prior physical knowledge directly into the\nmodel structure, combining the strengths of physics-based modeling and\ndeep-learning-based identification. The most common approach is to use an\nadditive model augmentation structure, where the physics-based and the\nmachine-learning (ML) components are connected in parallel. 
However, such\nmodels are overparametrized, and training them is challenging, potentially\ncausing the physics-based part to lose interpretability. To overcome this\nchallenge, this paper proposes an orthogonal projection-based regularization\ntechnique to enhance parameter learning, convergence, and even model accuracy\nin learning-based augmentation of nonlinear baseline models.\n","authors":["Bendegúz M. Györök","Jan H. Hoekstra","Johan Kon","Tamás Péni","Maarten Schoukens","Roland Tóth"],"pdf_url":"https://arxiv.org/pdf/2501.05842v1.pdf","comment":"Submitted to L4DC 2025"},{"id":"http://arxiv.org/abs/2309.13736v3","updated":"2025-01-10T10:31:19Z","published":"2023-09-24T19:40:15Z","title":"Geometry of Linear Neural Networks: Equivariance and Invariance under\n Permutation Groups","summary":" The set of functions parameterized by a linear fully-connected neural network\nis a determinantal variety. We investigate the subvariety of functions that are\nequivariant or invariant under the action of a permutation group. Examples of\nsuch group actions are translations or $90^\\circ$ rotations on images. We\ndescribe such equivariant or invariant subvarieties as direct products of\ndeterminantal varieties, from which we deduce their dimension, degree,\nEuclidean distance degree, and their singularities. We fully characterize\ninvariance for arbitrary permutation groups, and equivariance for cyclic\ngroups. We draw conclusions for the parameterization and the design of\nequivariant and invariant linear networks in terms of sparsity and\nweight-sharing properties. We prove that all invariant linear functions can be\nparameterized by a single linear autoencoder with a weight-sharing property\nimposed by the cycle decomposition of the considered permutation. The space of\nrank-bounded equivariant functions has several irreducible components, so it\ncannot be parameterized by a single network, but each irreducible component\ncan. 
Finally, we show that minimizing the squared-error loss on our invariant\nor equivariant networks reduces to minimizing the Euclidean distance from\ndeterminantal varieties via the Eckart-Young theorem.\n","authors":["Kathlén Kohn","Anna-Laura Sattelberger","Vahid Shahverdi"],"pdf_url":"https://arxiv.org/pdf/2309.13736v3.pdf","comment":"42 pages, 8 figures, 1 table; comments welcome!"},{"id":"http://arxiv.org/abs/2401.10726v4","updated":"2025-01-10T10:30:41Z","published":"2024-01-19T14:43:04Z","title":"Empowering Aggregators with Practical Data-Driven Tools: Harnessing\n Aggregated and Disaggregated Flexibility for Demand Response","summary":" This study explores the interaction between aggregators and building\noccupants in activating flexibility through Demand Response (DR) programs, with\na focus on reinforcing the resilience of the energy system considering the\nuncertainties presented by Renewable Energy Sources (RES). Firstly, it\nintroduces a methodology of optimizing aggregated flexibility provision\nstrategies in environments with limited data, utilizing Discrete Fourier\nTransformation (DFT) and clustering techniques to identify building occupants'\nactivity patterns. Secondly, the study assesses the disaggregated flexibility\nprovision of Heating Ventilation and Air Conditioning (HVAC) systems during DR\nevents, employing machine learning and optimization techniques for precise,\ndevice-level analysis. 
The first approach offers a non-intrusive pathway for\naggregators to provide flexibility services in environments of a single smart\nmeter for the whole building's consumption, while the second approach maximizes\nthe amount of flexibility in the case of dedicated metering devices to the HVAC\nsystems by carefully considering building occupants' thermal comfort profiles.\nThrough the application of data-driven techniques and encompassing case studies\nfrom both industrial and residential buildings, this paper not only unveils\npivotal opportunities for aggregators in the balancing and emerging flexibility\nmarkets but also successfully develops and demonstrates end-to-end practical\ntools for aggregators.\n","authors":["Costas Mylonas","Donata Boric","Leila Luttenberger Maric","Alexandros Tsitsanis","Eleftheria Petrianou","Magda Foti"],"pdf_url":"https://arxiv.org/pdf/2401.10726v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.15627v3","updated":"2025-01-10T10:24:19Z","published":"2024-06-21T20:06:31Z","title":"Benchmarking Uncertainty Quantification Methods for Large Language\n Models with LM-Polygraph","summary":" The rapid proliferation of large language models (LLMs) has stimulated\nresearchers to seek effective and efficient approaches to deal with LLM\nhallucinations and low-quality outputs. Uncertainty quantification (UQ) is a\nkey element of machine learning applications in dealing with such challenges.\nHowever, research to date on UQ for LLMs has been fragmented in terms of\ntechniques and evaluation methodologies. In this work, we address this issue by\nintroducing a novel benchmark that implements a collection of state-of-the-art\nUQ baselines and offers an environment for controllable and consistent\nevaluation of novel UQ techniques over various text generation tasks. Our\nbenchmark also supports the assessment of confidence normalization methods in\nterms of their ability to provide interpretable scores. 
Using our benchmark, we\nconduct a large-scale empirical investigation of UQ and normalization\ntechniques across eleven tasks, identifying the most effective approaches.\nCode: https://github.com/IINemo/lm-polygraph Benchmark:\nhttps://huggingface.co/LM-Polygraph\n","authors":["Roman Vashurin","Ekaterina Fadeeva","Artem Vazhentsev","Lyudmila Rvanova","Akim Tsvigun","Daniil Vasilev","Rui Xing","Abdelrahman Boda Sadallah","Kirill Grishchenkov","Sergey Petrakov","Alexander Panchenko","Timothy Baldwin","Preslav Nakov","Maxim Panov","Artem Shelmanov"],"pdf_url":"https://arxiv.org/pdf/2406.15627v3.pdf","comment":"Accepted to TACL 2025, pre-MIT Press publication version. Roman\n Vashurin, Ekaterina Fadeeva, Artem Vazhentsev contributed equally"},{"id":"http://arxiv.org/abs/2407.17163v3","updated":"2025-01-10T10:17:05Z","published":"2024-07-24T11:07:20Z","title":"dlordinal: a Python package for deep ordinal classification","summary":" dlordinal is a new Python library that unifies many recent deep ordinal\nclassification methodologies available in the literature. Developed using\nPyTorch as underlying framework, it implements the top performing\nstate-of-the-art deep learning techniques for ordinal classification problems.\nOrdinal approaches are designed to leverage the ordering information present in\nthe target variable. Specifically, it includes loss functions, various output\nlayers, dropout techniques, soft labelling methodologies, and other\nclassification strategies, all of which are appropriately designed to\nincorporate the ordinal information. Furthermore, as the performance metrics to\nassess novel proposals in ordinal classification depend on the distance between\ntarget and predicted classes in the ordinal scale, suitable ordinal evaluation\nmetrics are also included. dlordinal is distributed under the BSD-3-Clause\nlicense and is available at https://github.com/ayrna/dlordinal.\n","authors":["Francisco Bérchez-Moreno","Víctor M. 
Vargas","Rafael Ayllón-Gavilán","David Guijo-Rubio","César Hervás-Martínez","Juan C. Fernández","Pedro A. Gutiérrez"],"pdf_url":"https://arxiv.org/pdf/2407.17163v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05835v1","updated":"2025-01-10T10:16:35Z","published":"2025-01-10T10:16:35Z","title":"Fine-tuning is Not Fine: Mitigating Backdoor Attacks in GNNs with\n Limited Clean Data","summary":" Graph Neural Networks (GNNs) have achieved remarkable performance through\ntheir message-passing mechanism. However, recent studies have highlighted the\nvulnerability of GNNs to backdoor attacks, which can lead the model to\nmisclassify graphs with attached triggers as the target class. The\neffectiveness of recent promising defense techniques, such as fine-tuning or\ndistillation, is heavily contingent on having comprehensive knowledge of the\nsufficient training dataset. Empirical studies have shown that fine-tuning\nmethods require a clean dataset of 20% to reduce attack accuracy to below 25%,\nwhile distillation methods require a clean dataset of 15%. However, obtaining\nsuch a large amount of clean data is commonly impractical.\n In this paper, we propose a practical backdoor mitigation framework, denoted\nas GRAPHNAD, which can capture high-quality intermediate-layer representations\nin GNNs to enhance the distillation process with limited clean data. To achieve\nthis, we address the following key questions: How to identify the appropriate\nattention representations in graphs for distillation? How to enhance\ndistillation with limited data? By adopting the graph attention transfer\nmethod, GRAPHNAD can effectively align the intermediate-layer attention\nrepresentations of the backdoored model with that of the teacher model, forcing\nthe backdoor neurons to transform into benign ones. 
Besides, we extract the\nrelation maps from intermediate-layer transformation and enforce the relation\nmaps of the backdoored model to be consistent with that of the teacher model,\nthereby ensuring model accuracy while further reducing the influence of\nbackdoors. Extensive experimental results show that by fine-tuning a teacher\nmodel with only 3% of the clean data, GRAPHNAD can reduce the attack success\nrate to below 5%.\n","authors":["Jiale Zhang","Bosen Rao","Chengcheng Zhu","Xiaobing Sun","Qingming Li","Haibo Hu","Xiapu Luo","Qingqing Ye","Shouling Ji"],"pdf_url":"https://arxiv.org/pdf/2501.05835v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11875v2","updated":"2025-01-10T10:15:49Z","published":"2023-10-18T10:49:29Z","title":"Fractional Concepts in Neural Networks: Enhancing Activation Functions","summary":" Designing effective neural networks requires tuning architectural elements.\nThis study integrates fractional calculus into neural networks by introducing\nfractional order derivatives (FDO) as tunable parameters in activation\nfunctions, allowing diverse activation functions by adjusting the FDO. We\nevaluate these fractional activation functions on various datasets and network\narchitectures, comparing their performance with traditional and new activation\nfunctions. Our experiments assess their impact on accuracy, time complexity,\ncomputational overhead, and memory usage. Results suggest fractional activation\nfunctions, particularly fractional Sigmoid, offer benefits in some scenarios.\nChallenges related to consistency and efficiency remain. 
Practical implications\nand limitations are discussed.\n","authors":["Zahra Alijani","Vojtech Molek"],"pdf_url":"https://arxiv.org/pdf/2310.11875v2.pdf","comment":"8 pages, 8 figures, submitted to pattern recognition letters"},{"id":"http://arxiv.org/abs/2501.05819v1","updated":"2025-01-10T09:59:16Z","published":"2025-01-10T09:59:16Z","title":"Diffusion Models for Smarter UAVs: Decision-Making and Modeling","summary":" Unmanned Aerial Vehicles (UAVs) are increasingly adopted in modern\ncommunication networks. However, challenges in decision-making and digital\nmodeling continue to impede their rapid advancement. Reinforcement Learning\n(RL) algorithms face limitations such as low sample efficiency and limited data\nversatility, further magnified in UAV communication scenarios. Moreover,\nDigital Twin (DT) modeling introduces substantial decision-making and data\nmanagement complexities. RL models, often integrated into DT frameworks,\nrequire extensive training data to achieve accurate predictions. In contrast to\ntraditional approaches that focus on class boundaries, Diffusion Models (DMs),\na new class of generative AI, learn the underlying probability distribution\nfrom the training data and can generate trustworthy new patterns based on this\nlearned distribution. This paper explores the integration of DMs with RL and DT\nto effectively address these challenges. By combining the data generation\ncapabilities of DMs with the decision-making framework of RL and the modeling\naccuracy of DT, the integration improves the adaptability and real-time\nperformance of UAV communication. 
Moreover, the study shows how DMs can\nalleviate data scarcity, improve policy networks, and optimize dynamic\nmodeling, providing a robust solution for complex UAV communication\nscenarios.\n","authors":["Yousef Emami","Hao Zhou","Luis Almeida","Kai Li"],"pdf_url":"https://arxiv.org/pdf/2501.05819v1.pdf","comment":"7 pages, 2 figures"},{"id":"http://arxiv.org/abs/2412.13902v2","updated":"2025-01-10T09:55:54Z","published":"2024-12-18T14:42:43Z","title":"Threshold Neuron: A Brain-inspired Artificial Neuron for Efficient\n On-device Inference","summary":" Enhancing the computational efficiency of on-device Deep Neural Networks\n(DNNs) remains a significant challenge in mobile and edge computing. As we aim\nto execute increasingly complex tasks with constrained computational resources,\nmuch of the research has focused on compressing neural network structures and\nparameters or optimizing underlying systems, but there has been limited\nattention to optimizing the fundamental building blocks of neural networks:\nthe neurons. In this study, we deliberate on a simple but important research\nquestion: Can we design artificial neurons that offer greater efficiency than\nthe traditional neuron paradigm? Inspired by the threshold mechanisms and the\nexcitation-inhibition balance observed in biological neurons, we propose a\nnovel artificial neuron model, Threshold Neurons. Using Threshold Neurons, we\ncan construct neural networks similar to those with traditional artificial\nneurons, while significantly reducing hardware implementation complexity. Our\nextensive experiments validate the effectiveness of neural networks utilizing\nThreshold Neurons, achieving substantial power savings of 7.51x to 8.19x and\narea savings of 3.89x to 4.33x at the kernel level, with minimal loss in\nprecision. 
Furthermore, FPGA-based implementations\nof these networks demonstrate 2.52x power savings and 1.75x speed enhancements\nat the system level. The source code will be made available upon publication.\n","authors":["Zihao Zheng","Yuanchun Li","Jiayu Chen","Peng Zhou","Xiang Chen","Yunxin Liu"],"pdf_url":"https://arxiv.org/pdf/2412.13902v2.pdf","comment":"14 pages, 11 figures"},{"id":"http://arxiv.org/abs/2402.11650v2","updated":"2025-01-10T09:44:48Z","published":"2024-02-18T17:02:39Z","title":"Programmatic Reinforcement Learning: Navigating Gridworlds","summary":" The field of reinforcement learning (RL) is concerned with algorithms for\nlearning optimal policies in unknown stochastic environments. Programmatic RL\nstudies representations of policies as programs, meaning involving higher order\nconstructs such as control loops. Despite attracting a lot of attention at the\nintersection of the machine learning and formal methods communities, very\nlittle is known on the theoretical front about programmatic RL: what are good\nclasses of programmatic policies? How large are optimal programmatic policies?\nHow can we learn them? The goal of this paper is to give first answers to these\nquestions, initiating a theoretical study of programmatic RL. 
Considering a\nclass of gridworld environments, we define a class of programmatic policies.\nOur main contributions are to place upper bounds on the size of optimal\nprogrammatic policies, and to construct an algorithm for synthesizing them.\nThese theoretical findings are complemented by a prototype implementation of\nthe algorithm.\n","authors":["Guruprerana Shabadi","Nathanaël Fijalkow","Théo Matricon"],"pdf_url":"https://arxiv.org/pdf/2402.11650v2.pdf","comment":"Published in the proceedings of GenPlan, AAAI 2025 Workshop on\n Generalization in Planning"},{"id":"http://arxiv.org/abs/2412.09594v2","updated":"2025-01-10T09:40:04Z","published":"2024-12-12T18:58:14Z","title":"Wait-Less Offline Tuning and Re-solving for Online Decision Making","summary":" Online linear programming (OLP) has found broad applications in revenue\nmanagement and resource allocation. State-of-the-art OLP algorithms achieve low\nregret by repeatedly solving linear programming (LP) subproblems that\nincorporate updated resource information. However, LP-based methods are\ncomputationally expensive and often inefficient for large-scale applications.\nIn contrast, recent first-order OLP algorithms are more computationally\nefficient but typically suffer from worse regret guarantees. To address these\nshortcomings, we propose a new algorithm that combines the strengths of\nLP-based and first-order OLP methods. The algorithm re-solves the LP\nsubproblems periodically at a predefined frequency $f$ and uses the latest dual\nprices to guide online decision-making. In addition, a first-order method runs\nin parallel during each interval between LP re-solves, smoothing resource\nconsumption. 
Our algorithm achieves $\mathscr{O}(\log (T/f) + \sqrt{f})$\nregret, delivering a \"wait-less\" online decision-making process that balances\nthe computational efficiency of first-order methods and the superior regret\nguarantee of LP-based methods.\n","authors":["Jingruo Sun","Wenzhi Gao","Ellen Vitercik","Yinyu Ye"],"pdf_url":"https://arxiv.org/pdf/2412.09594v2.pdf","comment":"In this version, we achieve a tighter regret bound with the warm\n start for the first batch. We also make the proof more elegant by manually\n accepting all subsequent orders once the constraint is violated. In this way,\n we do not need to introduce the concept of stopping time for the analysis of\n the LP-based method"},{"id":"http://arxiv.org/abs/2308.00721v4","updated":"2025-01-10T09:35:20Z","published":"2023-07-31T03:56:46Z","title":"A Pre-trained Data Deduplication Model based on Active Learning","summary":" In the era of big data, the issue of data quality has become increasingly\nprominent. One of the main challenges is the problem of duplicate data, which\ncan arise from repeated entry or the merging of multiple data sources. These\n\"dirty data\" problems can significantly limit the effective application of big\ndata. To address the issue of data deduplication, we propose a pre-trained\ndeduplication model based on active learning, which is the first work that\nutilizes active learning to address the problem of deduplication at the\nsemantic level. The model is built on a pre-trained Transformer and fine-tuned\nto solve the deduplication problem as a sequence-to-classification task, which\nfirstly integrates the transformer with active learning into an end-to-end\narchitecture to select the most valuable data for deduplication model training,\nand also firstly employs the R-Drop method to perform data augmentation on each\nround of labeled data, which can reduce the cost of manual labeling and improve\nthe model's performance. 
Experimental results demonstrate that our proposed\nmodel outperforms previous state-of-the-art (SOTA) for deduplicated data\nidentification, achieving up to a 28% improvement in Recall score on benchmark\ndatasets.\n","authors":["Haochen Shi","Xinyao Liu","Fengmao Lv","Hongtao Xue","Jie Hu","Shengdong Du","Tianrui Li"],"pdf_url":"https://arxiv.org/pdf/2308.00721v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13840v2","updated":"2025-01-10T09:30:44Z","published":"2023-08-26T10:24:43Z","title":"Optimal Transport-inspired Deep Learning Framework for Slow-Decaying\n Kolmogorov n-width Problems: Exploiting Sinkhorn Loss and Wasserstein Kernel","summary":" Reduced order models (ROMs) are widely used in scientific computing to tackle\nhigh-dimensional systems. However, traditional ROM methods may only partially\ncapture the intrinsic geometric characteristics of the data. These\ncharacteristics encompass the underlying structure, relationships, and\nessential features crucial for accurate modeling.\n To overcome this limitation, we propose a novel ROM framework that integrates\noptimal transport (OT) theory and neural network-based methods. Specifically,\nwe investigate the Kernel Proper Orthogonal Decomposition (kPOD) method\nexploiting the Wasserstein distance as the custom kernel, and we efficiently\ntrain the resulting neural network (NN) employing the Sinkhorn algorithm. By\nleveraging an OT-based nonlinear reduction, the presented framework can capture\nthe geometric structure of the data, which is crucial for accurate learning of\nthe reduced solution manifold. 
When compared with traditional metrics such as\nmean squared error or cross-entropy, exploiting the Sinkhorn divergence as the\nloss function enhances stability during training, robustness against\noverfitting and noise, and accelerates convergence.\n To showcase the approach's effectiveness, we conduct experiments on a set of\nchallenging test cases exhibiting a slow decay of the Kolmogorov n-width. The\nresults show that our framework outperforms traditional ROM methods in terms of\naccuracy and computational efficiency.\n","authors":["Moaad Khamlich","Federico Pichi","Gianluigi Rozza"],"pdf_url":"https://arxiv.org/pdf/2308.13840v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.02221v2","updated":"2025-01-10T09:26:32Z","published":"2025-01-04T07:53:38Z","title":"CORD: Generalizable Cooperation via Role Diversity","summary":" Cooperative multi-agent reinforcement learning (MARL) aims to develop agents\nthat can collaborate effectively. However, most cooperative MARL methods\noverfit training agents, making learned policies not generalize well to unseen\ncollaborators, which is a critical issue for real-world deployment. Some\nmethods attempt to address the generalization problem but require prior\nknowledge or predefined policies of new teammates, limiting real-world\napplications. To this end, we propose a hierarchical MARL approach to enable\ngeneralizable cooperation via role diversity, namely CORD. CORD's high-level\ncontroller assigns roles to low-level agents by maximizing the role entropy\nwith constraints. We show this constrained objective can be decomposed into\ncausal influence in role that enables reasonable role assignment, and role\nheterogeneity that yields coherent, non-redundant role clusters. Evaluated on a\nvariety of cooperative multi-agent tasks, CORD achieves better performance than\nbaselines, especially in generalization tests. 
Ablation studies further\ndemonstrate the efficacy of the constrained objective in generalizable\ncooperation.\n","authors":["Kanefumi Matsuyama","Kefan Su","Jiangxing Wang","Deheng Ye","Zongqing Lu"],"pdf_url":"https://arxiv.org/pdf/2501.02221v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.01042v2","updated":"2025-01-10T09:21:43Z","published":"2025-01-02T03:52:22Z","title":"Image-based Multimodal Models as Intruders: Transferable Multimodal\n Attacks on Video-based MLLMs","summary":" Video-based multimodal large language models (V-MLLMs) have shown\nvulnerability to adversarial examples in video-text multimodal tasks. However,\nthe transferability of adversarial videos to unseen models--a common and\npractical real world scenario--remains unexplored. In this paper, we pioneer an\ninvestigation into the transferability of adversarial video samples across\nV-MLLMs. We find that existing adversarial attack methods face significant\nlimitations when applied in black-box settings for V-MLLMs, which we attribute\nto the following shortcomings: (1) lacking generalization in perturbing video\nfeatures, (2) focusing only on sparse key-frames, and (3) failing to integrate\nmultimodal information. To address these limitations and deepen the\nunderstanding of V-MLLM vulnerabilities in black-box scenarios, we introduce\nthe Image-to-Video MLLM (I2V-MLLM) attack. In I2V-MLLM, we utilize an\nimage-based multimodal model (IMM) as a surrogate model to craft adversarial\nvideo samples. Multimodal interactions and temporal information are integrated\nto disrupt video representations within the latent space, improving adversarial\ntransferability. In addition, a perturbation propagation technique is\nintroduced to handle different unknown frame sampling strategies. Experimental\nresults demonstrate that our method can generate adversarial examples that\nexhibit strong transferability across different V-MLLMs on multiple video-text\nmultimodal tasks. 
Compared to white-box attacks on these models, our black-box\nattacks (using BLIP-2 as a surrogate model) achieve competitive performance,\nwith average attack success rates of 55.48% on MSVD-QA and 58.26% on MSRVTT-QA\nfor VideoQA tasks, respectively. Our code will be released upon acceptance.\n","authors":["Linhao Huang","Xue Jiang","Zhiqiang Wang","Wentao Mo","Xi Xiao","Bo Han","Yongjie Yin","Feng Zheng"],"pdf_url":"https://arxiv.org/pdf/2501.01042v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05809v1","updated":"2025-01-10T09:19:10Z","published":"2025-01-10T09:19:10Z","title":"AdaPRL: Adaptive Pairwise Regression Learning with Uncertainty\n Estimation for Universal Regression Tasks","summary":" Current deep regression models usually learn in a point-wise way that treats\neach sample as an independent input, neglecting the relative ordering among\ndifferent data. Consequently, the regression model could neglect the data's\ninterrelationships, potentially resulting in suboptimal performance. Moreover,\nthe existence of aleatoric uncertainty in the training data may drive the model\nto capture non-generalizable patterns, contributing to increased overfitting.\nTo address these issues, we propose a novel adaptive pairwise learning\nframework (AdaPRL) for regression tasks, which leverages the relative\ndifferences between data points and integrates with deep probabilistic models\nto quantify the uncertainty associated with the predictions. Additionally, we\nadapt AdaPRL for applications in multi-task learning and multivariate time\nseries forecasting. 
Extensive experiments with several real-world regression\ndatasets including recommendation systems, age estimation, time series\nforecasting, natural language understanding, finance, and industry datasets\nshow that AdaPRL is compatible with different backbone networks in various\ntasks and achieves state-of-the-art performance on the vast majority of tasks,\nhighlighting its notable potential including enhancing prediction accuracy and\nranking ability, increasing generalization capability, improving robustness to\nnoisy data, improving resilience to reduced data, and enhancing\ninterpretability, etc.\n","authors":["Fuhang Liang","Rucong Xu","Deng Lin"],"pdf_url":"https://arxiv.org/pdf/2501.05809v1.pdf","comment":"22 pages, 11 figures"},{"id":"http://arxiv.org/abs/2402.13572v2","updated":"2025-01-10T09:11:39Z","published":"2024-02-21T07:07:54Z","title":"AlgoFormer: An Efficient Transformer Framework with Algorithmic\n Structures","summary":" Besides natural language processing, transformers exhibit extraordinary\nperformance in solving broader applications, including scientific computing and\ncomputer vision. Previous works try to explain this from the expressive power\nand capability perspectives that standard transformers are capable of\nperforming some algorithms. To empower transformers with algorithmic\ncapabilities and motivated by the recently proposed looped transformer, we\ndesign a novel transformer framework, dubbed Algorithm Transformer (abbreviated\nas AlgoFormer). We provide an insight that efficient transformer architectures\ncan be designed by leveraging prior knowledge of tasks and the underlying\nstructure of potential algorithms. Compared with the standard transformer and\nvanilla looped transformer, the proposed AlgoFormer can perform efficiently in\nalgorithm representation in some specific tasks. 
In particular, inspired by the\nstructure of human-designed learning algorithms, our transformer framework\nconsists of a pre-transformer that is responsible for task preprocessing, a\nlooped transformer for iterative optimization algorithms, and a\npost-transformer for producing the desired results after post-processing. We\nprovide theoretical evidence of the expressive power of the AlgoFormer in\nsolving some challenging problems, mirroring human-designed algorithms.\nFurthermore, some theoretical and empirical results are presented to show that\nthe designed transformer has the potential to perform algorithm representation\nand learning. Experimental results demonstrate the empirical superiority of the\nproposed transformer in that it outperforms the standard transformer and\nvanilla looped transformer in some specific tasks. An extensive experiment on\nreal language tasks (e.g., neural machine translation of German and English,\nand text classification) further validates the expressiveness and effectiveness\nof AlgoFormer.\n","authors":["Yihang Gao","Chuanyang Zheng","Enze Xie","Han Shi","Tianyang Hu","Yu Li","Michael K. Ng","Zhenguo Li","Zhaoqiang Liu"],"pdf_url":"https://arxiv.org/pdf/2402.13572v2.pdf","comment":"Published at Transactions on Machine Learning Research (TMLR). The\n paper provides insight that the Transformer architectures can mimic the\n algorithm structures in (in-context) algorithm learning and representation.\n The incorporated algorithmic structure in Algoformer shows its potential in\n (deep learning for) scientific computing, besides the real language tasks"},{"id":"http://arxiv.org/abs/2501.05803v1","updated":"2025-01-10T09:10:30Z","published":"2025-01-10T09:10:30Z","title":"Alignment without Over-optimization: Training-Free Solution for\n Diffusion Models","summary":" Diffusion models excel in generative tasks, but aligning them with specific\nobjectives while maintaining their versatility remains challenging. 
Existing\nfine-tuning methods often suffer from reward over-optimization, while\napproximate guidance approaches fail to optimize target rewards effectively.\nAddressing these limitations, we propose a training-free sampling method based\non Sequential Monte Carlo (SMC) to sample from the reward-aligned target\ndistribution. Our approach, tailored for diffusion sampling and incorporating\ntempering techniques, achieves comparable or superior target rewards to\nfine-tuning methods while preserving diversity and cross-reward generalization.\nWe demonstrate its effectiveness in single-reward optimization, multi-objective\nscenarios, and online black-box optimization. This work offers a robust\nsolution for aligning diffusion models with diverse downstream objectives\nwithout compromising their general capabilities. Code is available at\nhttps://github.com/krafton-ai/DAS .\n","authors":["Sunwoo Kim","Minkyu Kim","Dongmin Park"],"pdf_url":"https://arxiv.org/pdf/2501.05803v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05795v1","updated":"2025-01-10T08:57:50Z","published":"2025-01-10T08:57:50Z","title":"Robust Counterfactual Explanations under Model Multiplicity Using\n Multi-Objective Optimization","summary":" In recent years, explainability in machine learning has gained importance. In\nthis context, counterfactual explanation (CE), which is an explanation method\nthat uses examples, has attracted attention. However, it has been pointed out\nthat CE is not robust when there are multiple machine-learning models. These\nproblems are important when using machine learning to make safe decisions. In\nthis paper, we propose robust CEs that introduce a new viewpoint - Pareto\nimprovement - and a method that uses multi-objective optimization to generate\nit. To evaluate the proposed method, we conducted experiments using both\nsimulated and actual data. The results demonstrate that the proposed method is\nrobust and useful. 
We believe that this research will contribute to a wide\nrange of research areas, such as explainability in machine learning,\ndecision-making, and action planning based on machine learning.\n","authors":["Keita Kinjo"],"pdf_url":"https://arxiv.org/pdf/2501.05795v1.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2501.05790v1","updated":"2025-01-10T08:50:38Z","published":"2025-01-10T08:50:38Z","title":"Understanding Impact of Human Feedback via Influence Functions","summary":" In Reinforcement Learning from Human Feedback (RLHF), it is crucial to learn\nsuitable reward models from human feedback to align large language models\n(LLMs) with human intentions. However, human feedback can often be noisy,\ninconsistent, or biased, especially when evaluating complex responses. Such\nfeedback can lead to misaligned reward signals, potentially causing unintended\nside effects during the RLHF process. To address these challenges, we explore\nthe use of influence functions to measure the impact of human feedback on the\nperformance of reward models. We propose a compute-efficient approximation\nmethod that enables the application of influence functions to LLM-based reward\nmodels and large-scale preference datasets. In our experiments, we demonstrate\ntwo key applications of influence functions: (1) detecting common forms of\nlabeler bias in human feedback datasets and (2) guiding labelers to refine\ntheir strategies to align more closely with expert feedback. By quantifying the\nimpact of human feedback on reward models, we believe that influence functions\ncan enhance feedback interpretability and contribute to scalable oversight in\nRLHF, helping labelers provide more accurate and consistent feedback. 
Source\ncode is available at https://github.com/mintaywon/IF_RLHF\n","authors":["Taywon Min","Haeone Lee","Hanho Ryu","Yongchan Kwon","Kimin Lee"],"pdf_url":"https://arxiv.org/pdf/2501.05790v1.pdf","comment":"Source code: https://github.com/mintaywon/IF_RLHF"},{"id":"http://arxiv.org/abs/2501.02564v2","updated":"2025-01-10T08:40:49Z","published":"2025-01-05T14:42:47Z","title":"Balanced Multi-view Clustering","summary":" Multi-view clustering (MvC) aims to integrate information from different\nviews to enhance the capability of the model in capturing the underlying data\nstructures. The widely used joint training paradigm in MvC is potentially not\nfully leverage the multi-view information, since the imbalanced and\nunder-optimized view-specific features caused by the uniform learning objective\nfor all views. For instance, particular views with more discriminative\ninformation could dominate the learning process in the joint training paradigm,\nleading to other views being under-optimized. To alleviate this issue, we first\nanalyze the imbalanced phenomenon in the joint-training paradigm of multi-view\nclustering from the perspective of gradient descent for each view-specific\nfeature extractor. Then, we propose a novel balanced multi-view clustering\n(BMvC) method, which introduces a view-specific contrastive regularization\n(VCR) to modulate the optimization of each view. Concretely, VCR preserves the\nsample similarities captured from the joint features and view-specific ones\ninto the clustering distributions corresponding to view-specific features to\nenhance the learning process of view-specific feature extractors. Additionally,\na theoretical analysis is provided to illustrate that VCR adaptively modulates\nthe magnitudes of gradients for updating the parameters of view-specific\nfeature extractors to achieve a balanced multi-view learning procedure. 
In such\na manner, BMvC achieves a better trade-off between the exploitation of\nview-specific patterns and the exploration of view-invariance patterns to fully\nlearn the multi-view information for the clustering task. Finally, a set of\nexperiments are conducted to verify the superiority of the proposed method\ncompared with state-of-the-art approaches both on eight benchmark MvC datasets\nand two spatially resolved transcriptomics datasets.\n","authors":["Zhenglai Li","Jun Wang","Chang Tang","Xinzhong Zhu","Wei Zhang","Xinwang Liu"],"pdf_url":"https://arxiv.org/pdf/2501.02564v2.pdf","comment":"We are withdrawing this paper due to issues in the experimental\n section related to the Application for Spatially Resolved Transcriptomics\n Data Clustering. These issues affect the validity of the results presented.\n We believe it is necessary to withdraw the paper to address these problems\n adequately before resubmission."},{"id":"http://arxiv.org/abs/2501.05775v1","updated":"2025-01-10T08:15:02Z","published":"2025-01-10T08:15:02Z","title":"STHFL: Spatio-Temporal Heterogeneous Federated Learning","summary":" Federated learning is a new framework that protects data privacy and allows\nmultiple devices to cooperate in training machine learning models. Previous\nstudies have proposed multiple approaches to eliminate the challenges posed by\nnon-iid data and inter-domain heterogeneity issues. However, they ignore the\n\\textbf{spatio-temporal} heterogeneity formed by different data distributions\nof increasing task data in the intra-domain. Moreover, the global data is\ngenerally a long-tailed distribution rather than assuming the global data is\nbalanced in practical applications. To tackle the \\textbf{spatio-temporal}\ndilemma, we propose a novel setting named \\textbf{Spatio-Temporal\nHeterogeneity} Federated Learning (STHFL). Specially, the Global-Local Dynamic\nPrototype (GLDP) framework is designed for STHFL. 
In GLDP, the model in each\nclient contains personalized layers which can dynamically adapt to different\ndata distributions. For long-tailed data distribution, global prototypes are\nserved as complementary knowledge for the training on classes with few samples\nin clients without leaking privacy. As tasks increase in clients, the knowledge\nof local prototypes generated in previous tasks guides for training in the\ncurrent task to solve catastrophic forgetting. Meanwhile, the global-local\nprototypes are updated through the moving average method after training local\nprototypes in clients. Finally, we evaluate the effectiveness of GLDP, which\nachieves remarkable results compared to state-of-the-art methods in STHFL\nscenarios.\n","authors":["Shunxin Guo","Hongsong Wang","Shuxia Lin","Xu Yang","Xin Geng"],"pdf_url":"https://arxiv.org/pdf/2501.05775v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04614v2","updated":"2025-01-10T08:07:16Z","published":"2024-12-05T21:00:46Z","title":"Extractive Structures Learned in Pretraining Enable Generalization on\n Finetuned Facts","summary":" Pretrained language models (LMs) can generalize to implications of facts that\nthey are finetuned on. For example, if finetuned on ``John Doe lives in Tokyo,\"\nLMs can correctly answer ``What language do the people in John Doe's city\nspeak?'' with ``Japanese''. However, little is known about the mechanisms that\nenable this generalization or how they are learned during pretraining. We\nintroduce extractive structures as a framework for describing how components in\nLMs (e.g., MLPs or attention heads) coordinate to enable this generalization.\nThe structures consist of informative components that store training facts as\nweight changes, and upstream and downstream extractive components that query\nand process the stored information to produce the correct implication. 
We\nhypothesize that extractive structures are learned during pretraining when\nencountering implications of previously known facts. This yields two\npredictions: a data ordering effect where extractive structures can be learned\nonly if facts precede their implications, and a weight grafting effect where\nextractive structures can be transferred to predict counterfactual\nimplications. We empirically demonstrate these phenomena in the OLMo-7b, Llama\n3-8b, Gemma 2-9b, and Qwen 2-7b models. Of independent interest, our results\nalso indicate that fact learning can occur at both early and late layers, which\nlead to different forms of generalization.\n","authors":["Jiahai Feng","Stuart Russell","Jacob Steinhardt"],"pdf_url":"https://arxiv.org/pdf/2412.04614v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05772v1","updated":"2025-01-10T08:07:14Z","published":"2025-01-10T08:07:14Z","title":"rmlnomogram: An R package to construct an explainable nomogram for any\n machine learning algorithms","summary":" Background: Current nomogram can only be created for regression algorithm.\nProviding nomogram for any machine learning (ML) algorithms may accelerate\nmodel deployment in clinical settings or improve model availability. We\ndeveloped an R package and web application to construct nomogram with model\nexplainability of any ML algorithms. Methods: We formulated a function to\ntransform an ML prediction model into a nomogram, requiring datasets with: (1)\nall possible combinations of predictor values; (2) the corresponding outputs of\nthe model; and (3) the corresponding explainability values for each predictor\n(optional). Web application was also created. 
Results: Our R package could\ncreate 5 types of nomograms for categorical predictors and binary outcome\nwithout probability (1), categorical predictors and binary outcome with\nprobability (2) or continuous outcome (3), and categorical with single\nnumerical predictors and binary outcome with probability (4) or continuous\noutcome (5). Respectively, the first and remaining types optimally allowed\nmaximum 15 and 5 predictors with maximum 3,200 combinations. Web application is\nprovided with such limits. The explainability values were possible for types 2\nto 5. Conclusions: Our R package and web application could construct nomogram\nwith model explainability of any ML algorithms using a fair number of\npredictors.\n","authors":["Herdiantri Sufriyana","Emily Chia-Yu Su"],"pdf_url":"https://arxiv.org/pdf/2501.05772v1.pdf","comment":"16 pages, 2 figures, 1 table, 3 equations, 1 algorithm, 4 code\n snippets"},{"id":"http://arxiv.org/abs/2311.02565v2","updated":"2025-01-10T08:01:09Z","published":"2023-11-05T04:43:48Z","title":"KITS: Inductive Spatio-Temporal Kriging with Increment Training Strategy","summary":" Sensors are commonly deployed to perceive the environment. However, due to\nthe high cost, sensors are usually sparsely deployed. Kriging is the tailored\ntask to infer the unobserved nodes (without sensors) using the observed source\nnodes (with sensors). The essence of kriging task is transferability. Recently,\nseveral inductive spatio-temporal kriging methods have been proposed based on\ngraph neural networks, being trained based on a graph built on top of observed\nnodes via pretext tasks such as masking nodes out and reconstructing them.\nHowever, the graph in training is inevitably much sparser than the graph in\ninference that includes all the observed and unobserved nodes. The learned\npattern cannot be well generalized for inference, denoted as graph gap. 
To\naddress this issue, we first present a novel Increment training strategy:\ninstead of masking nodes (and reconstructing them), we add virtual nodes into\nthe training graph so as to mitigate the graph gap issue naturally.\nNevertheless, the empty-shell virtual nodes without labels could have\nbad-learned features and lack supervision signals. To solve these issues, we\npair each virtual node with its most similar observed node and fuse their\nfeatures together; to enhance the supervision signal, we construct reliable\npseudo labels for virtual nodes. As a result, the learned pattern of virtual\nnodes could be safely transferred to real unobserved nodes for reliable\nkriging. We name our new Kriging model with Increment Training Strategy as\nKITS. Extensive experiments demonstrate that KITS consistently outperforms\nexisting kriging methods by large margins, e.g., the improvement over MAE score\ncould be as high as 18.33%.\n","authors":["Qianxiong Xu","Cheng Long","Ziyue Li","Sijie Ruan","Rui Zhao","Zhishuai Li"],"pdf_url":"https://arxiv.org/pdf/2311.02565v2.pdf","comment":"This paper is accepted by AAAI'25"},{"id":"http://arxiv.org/abs/2501.05768v1","updated":"2025-01-10T07:56:30Z","published":"2025-01-10T07:56:30Z","title":"Halal or Not: Knowledge Graph Completion for Predicting Cultural\n Appropriateness of Daily Products","summary":" The growing demand for halal cosmetic products has exposed significant\nchallenges, especially in Muslim-majority countries. Recently, various machine\nlearning-based strategies, e.g., image-based methods, have shown remarkable\nsuccess in predicting the halal status of cosmetics. However, these methods\nmainly focus on analyzing the discrete and specific ingredients within separate\ncosmetics, which ignore the high-order and complex relations between cosmetics\nand ingredients. 
To address this problem, we propose a halal cosmetic\nrecommendation framework, namely HaCKG, that leverages a knowledge graph of\ncosmetics and their ingredients to explicitly model and capture the\nrelationships between cosmetics and their components. By representing cosmetics\nand ingredients as entities within the knowledge graph, HaCKG effectively\nlearns the high-order and complex relations between entities, offering a robust\nmethod for predicting halal status. Specifically, we first construct a cosmetic\nknowledge graph representing the relations between various cosmetics,\ningredients, and their properties. We then propose a pre-trained relational\ngraph attention network model with residual connections to learn the structural\nrelation between entities in the knowledge graph. The pre-trained model is then\nfine-tuned on downstream cosmetic data to predict halal status. Extensive\nexperiments on the cosmetic dataset over halal prediction tasks demonstrate the\nsuperiority of our model over state-of-the-art baselines.\n","authors":["Van Thuy Hoang","Tien-Bach-Thanh Do","Jinho Seo","Seung Charlie Kim","Luong Vuong Nguyen","Duong Nguyen Minh Huy","Hyeon-Ju Jeon","O-Joun Lee"],"pdf_url":"https://arxiv.org/pdf/2501.05768v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2501.04063v2","updated":"2025-01-10T07:44:50Z","published":"2025-01-07T10:54:25Z","title":"Fuzzy Information Entropy and Region Biased Matrix Factorization for Web\n Service QoS Prediction","summary":" Nowadays, there are many similar services available on the internet, making\nQuality of Service (QoS) a key concern for users. Since collecting QoS values\nfor all services through user invocations is impractical, predicting QoS values\nis a more feasible approach. Matrix factorization is considered an effective\nprediction method. 
However, most existing matrix factorization algorithms focus\non capturing global similarities between users and services, overlooking the\nlocal similarities between users and their similar neighbors, as well as the\nnon-interactive effects between users and services. This paper proposes a\nmatrix factorization approach based on user information entropy and region\nbias, which utilizes a similarity measurement method based on fuzzy information\nentropy to identify similar neighbors of users. Simultaneously, it integrates\nthe region bias between each user and service linearly into matrix\nfactorization to capture the non-interactive features between users and\nservices. This method demonstrates improved predictive performance in more\nrealistic and complex network environments. Additionally, numerous experiments\nare conducted on real-world QoS datasets. The experimental results show that\nthe proposed method outperforms some of the state-of-the-art methods in the\nfield at matrix densities ranging from 5% to 20%.\n","authors":["Guoxing Tang","Yugen Du","Xia Chen","Yingwei Luo","Benchi Ma"],"pdf_url":"https://arxiv.org/pdf/2501.04063v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.12746v2","updated":"2025-01-10T07:38:53Z","published":"2023-10-19T13:50:56Z","title":"TabuLa: Harnessing Language Models for Tabular Data Synthesis","summary":" Tabular data synthesis is crucial for addressing privacy and security\nconcerns in industries reliant on tabular data. While recent advancements adopt\nlarge language models (LLMs) for realistic tabular data generation, their long\ntraining times and limited reusability hinder practical applications. In this\npaper, we propose Tabula, a tabular data synthesizer that leverages the\nstructure of LLM. 
Unlike state-of-the-art (SOTA) LLM-based tabular data\nsynthesizers that rely on pre-trained LLMs, Tabula discards the pre-trained\nweights originally designed for natural language tasks, focusing instead on a\ntailored approach for tabular data. In addition, Tabula introduces a token\nsequence compression strategy that significantly reduces training time while\nmaintaining data quality, alongside a novel token padding method that improves\nsequence alignment across training batches. Experiments on six datasets show\nthat Tabula achieves superior synthetic data utility compared to current SOTA\nmethods. Additionally, the results demonstrate that Tabula model trained on\ntabular datasets serves effectively as a foundational model for synthesizing\nnew tabular datasets. Furthermore, the proposed padding method outperforms the\nconventional left and right padding strategies. Finally, the results highlight\nthat Tabula averagely reduces training time per epoch by 46.2% compared to\nstate-of-the-art LLM approaches while achieving higher data utility. Our code\nis available at https://github.com/zhao-zilong/Tabula\n","authors":["Zilong Zhao","Robert Birke","Lydia Chen"],"pdf_url":"https://arxiv.org/pdf/2310.12746v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05762v1","updated":"2025-01-10T07:38:24Z","published":"2025-01-10T07:38:24Z","title":"Development and Comparison of Model-Based and Data-Driven Approaches for\n the Prediction of the Mechanical Properties of Lattice Structures","summary":" Lattice structures have great potential for several application fields\nranging from medical and tissue engineering to aeronautical one. Their\ndevelopment is further speeded up by the continuing advances in additive\nmanufacturing technologies that allow to overcome issues typical of standard\nprocesses and to propose tailored designs. However, the design of lattice\nstructures is still challenging since their properties are considerably\naffected by numerous factors. 
The present paper aims to propose, discuss, and\ncompare various modeling approaches to describe, understand, and predict the\ncorrelations between the mechanical properties and the void volume fraction of\ndifferent types of lattice structures fabricated by fused deposition modeling\n3D printing. Particularly, four approaches are proposed: (i) a simplified\nanalytical model; (ii) a semi-empirical model combining analytical equations\nwith experimental correction factors; (iii) an artificial neural network\ntrained on experimental data; (iv) numerical simulations by finite element\nanalyses. The comparison among the various approaches, and with experimental\ndata, allows to identify the performances, advantages, and disadvantages of\neach approach, thus giving important guidelines for choosing the right design\nmethodology based on the needs and available data.\n","authors":["Chiara Pasini","Oscar Ramponi","Stefano Pandini","Luciana Sartore","Giulia Scalet"],"pdf_url":"https://arxiv.org/pdf/2501.05762v1.pdf","comment":"This work was funded by the European Union ERC CoDe4Bio Grant ID\n 101039467 under the funding programme Horizon Europe"},{"id":"http://arxiv.org/abs/2405.18144v3","updated":"2025-01-10T07:22:12Z","published":"2024-05-28T13:02:56Z","title":"4-bit Shampoo for Memory-Efficient Network Training","summary":" Second-order optimizers, maintaining a matrix termed a preconditioner, are\nsuperior to first-order optimizers in both theory and practice. The states\nforming the preconditioner and its inverse root restrict the maximum size of\nmodels trained by second-order optimizers. To address this, compressing 32-bit\noptimizer states to lower bitwidths has shown promise in reducing memory usage.\nHowever, current approaches only pertain to first-order optimizers. In this\npaper, we propose the first 4-bit second-order optimizers, exemplified by 4-bit\nShampoo, maintaining performance similar to that of 32-bit ones. 
We show that\nquantizing the eigenvector matrix of the preconditioner in 4-bit Shampoo is\nremarkably better than quantizing the preconditioner itself both theoretically\nand experimentally. By rectifying the orthogonality of the quantized\neigenvector matrix, we enhance the approximation of the preconditioner's\neigenvector matrix, which also benefits the computation of its inverse 4-th\nroot. Besides, we find that linear square quantization slightly outperforms\ndynamic tree quantization when quantizing second-order optimizer states.\nEvaluation on various networks for image classification and natural language\nmodeling demonstrates that our 4-bit Shampoo achieves comparable performance to\nits 32-bit counterpart while being more memory-efficient.\n","authors":["Sike Wang","Pan Zhou","Jia Li","Hua Huang"],"pdf_url":"https://arxiv.org/pdf/2405.18144v3.pdf","comment":"NeurIPS 2024 final camera-ready revisions, rectify the legend in\n figure 9"},{"id":"http://arxiv.org/abs/2501.05755v1","updated":"2025-01-10T07:13:42Z","published":"2025-01-10T07:13:42Z","title":"CognoSpeak: an automatic, remote assessment of early cognitive decline\n in real-world conversational speech","summary":" The early signs of cognitive decline are often noticeable in conversational\nspeech, and identifying those signs is crucial in dealing with later and more\nserious stages of neurodegenerative diseases. Clinical detection is costly and\ntime-consuming and although there has been recent progress in the automatic\ndetection of speech-based cues, those systems are trained on relatively small\ndatabases, lacking detailed metadata and demographic information. This paper\npresents CognoSpeak and its associated data collection efforts. CognoSpeak asks\nmemory-probing long and short-term questions and administers standard cognitive\ntasks such as verbal and semantic fluency and picture description using a\nvirtual agent on a mobile or web platform. 
In addition, it collects multimodal\ndata such as audio and video along with a rich set of metadata from primary and\nsecondary care, memory clinics and remote settings like people's homes. Here,\nwe present results from 126 subjects whose audio was manually transcribed.\nSeveral classic classifiers, as well as large language model-based classifiers,\nhave been investigated and evaluated across the different types of prompts. We\ndemonstrate a high level of performance; in particular, we achieved an F1-score\nof 0.873 using a DistilBERT model to discriminate people with cognitive\nimpairment (dementia and people with mild cognitive impairment (MCI)) from\nhealthy volunteers using the memory responses, fluency tasks and cookie theft\npicture description. CognoSpeak is an automatic, remote, low-cost, repeatable,\nnon-invasive and less stressful alternative to existing clinical cognitive\nassessments.\n","authors":["Madhurananda Pahar","Fuxiang Tao","Bahman Mirheidari","Nathan Pevy","Rebecca Bright","Swapnil Gadgil","Lise Sproson","Dorota Braun","Caitlin Illingworth","Daniel Blackburn","Heidi Christensen"],"pdf_url":"https://arxiv.org/pdf/2501.05755v1.pdf","comment":"This paper has been accepted for publication in IEEE SSCI 2025.\n Copyright belongs to IEEE"},{"id":"http://arxiv.org/abs/2501.05745v1","updated":"2025-01-10T06:21:48Z","published":"2025-01-10T06:21:48Z","title":"Covariate Dependent Mixture of Bayesian Networks","summary":" Learning the structure of Bayesian networks from data provides insights into\nunderlying processes and the causal relationships that generate the data, but\nits usefulness depends on the homogeneity of the data population, a condition\noften violated in real-world applications. In such cases, using a single\nnetwork structure for inference can be misleading, as it may not capture\nsub-population differences. 
To address this, we propose a novel approach of\nmodelling a mixture of Bayesian networks where component probabilities depend\non individual characteristics. Our method identifies both network structures\nand demographic predictors of sub-population membership, aiding personalised\ninterventions. We evaluate our method through simulations and a youth mental\nhealth case study, demonstrating its potential to improve tailored\ninterventions in health, education, and social policy.\n","authors":["Roman Marchant","Dario Draca","Gilad Francis","Sahand Assadzadeh","Mathew Varidel","Frank Iorfino","Sally Cripps"],"pdf_url":"https://arxiv.org/pdf/2501.05745v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05744v1","updated":"2025-01-10T06:20:27Z","published":"2025-01-10T06:20:27Z","title":"LLVD: LSTM-based Explicit Motion Modeling in Latent Space for Blind\n Video Denoising","summary":" Video restoration plays a pivotal role in revitalizing degraded video content\nby rectifying imperfections caused by various degradations introduced during\ncapturing (sensor noise, motion blur, etc.), saving/sharing (compression,\nresizing, etc.) and editing. This paper introduces a novel algorithm designed\nfor scenarios where noise is introduced during video capture, aiming to enhance\nthe visual quality of videos by reducing unwanted noise artifacts. We propose\nthe Latent space LSTM Video Denoiser (LLVD), an end-to-end blind denoising\nmodel. LLVD uniquely combines spatial and temporal feature extraction,\nemploying Long Short Term Memory (LSTM) within the encoded feature domain. This\nintegration of LSTM layers is crucial for maintaining continuity and minimizing\nflicker in the restored video. Moreover, processing frames in the encoded\nfeature domain significantly reduces computations, resulting in a very\nlightweight architecture. 
LLVD's blind nature makes it versatile for real,\nin-the-wild denoising scenarios where prior information about noise\ncharacteristics is not available. Experiments reveal that LLVD demonstrates\nexcellent performance for both synthetic and captured noise. Specifically, LLVD\nsurpasses the current State-Of-The-Art (SOTA) in RAW denoising by 0.3dB, while\nalso achieving a 59\\% reduction in computational complexity.\n","authors":["Loay Rashid","Siddharth Roheda","Amit Unde"],"pdf_url":"https://arxiv.org/pdf/2501.05744v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05735v1","updated":"2025-01-10T06:04:32Z","published":"2025-01-10T06:04:32Z","title":"ELENA: Epigenetic Learning through Evolved Neural Adaptation","summary":" Despite the success of metaheuristic algorithms in solving complex network\noptimization problems, they often struggle with adaptation, especially in\ndynamic or high-dimensional search spaces. Traditional approaches can become\nstuck in local optima, leading to inefficient exploration and suboptimal\nsolutions. Most of the widely accepted advanced algorithms do well either on\nhighly complex or smaller search spaces due to the lack of adaptation. To\naddress these limitations, we present ELENA (Epigenetic Learning through\nEvolved Neural Adaptation), a new evolutionary framework that incorporates\nepigenetic mechanisms to enhance the adaptability of the core evolutionary\napproach. ELENA leverages compressed representation of learning parameters\nimproved dynamically through epigenetic tags that serve as adaptive memory.\nThree epigenetic tags (mutation resistance, crossover affinity, and stability\nscore) assist with guiding solution space search, facilitating a more\nintelligent hypothesis landscape exploration. To assess the framework\nperformance, we conduct experiments on three critical network optimization\nproblems: the Traveling Salesman Problem (TSP), the Vehicle Routing Problem\n(VRP), and the Maximum Clique Problem (MCP). 
Experiments indicate that ELENA\nachieves competitive results, often surpassing state-of-the-art methods on\nnetwork optimization tasks.\n","authors":["Boris Kriuk","Keti Sulamanidze","Fedor Kriuk"],"pdf_url":"https://arxiv.org/pdf/2501.05735v1.pdf","comment":"15 pages, 6 figures, 4 tables, 2 algorithms"},{"id":"http://arxiv.org/abs/2501.05731v1","updated":"2025-01-10T05:55:14Z","published":"2025-01-10T05:55:14Z","title":"Diving Deep: Forecasting Sea Surface Temperatures and Anomalies","summary":" This overview paper details the findings from the Diving Deep: Forecasting\nSea Surface Temperatures and Anomalies Challenge at the European Conference on\nMachine Learning and Principles and Practice of Knowledge Discovery in\nDatabases (ECML PKDD) 2024. The challenge focused on the data-driven\npredictability of global sea surface temperatures (SSTs), a key factor in\nclimate forecasting, ecosystem management, fisheries management, and climate\nchange monitoring. The challenge involved forecasting SST anomalies (SSTAs)\nthree months in advance using historical data and included a special task of\npredicting SSTAs nine months ahead for the Baltic Sea. Participants utilized\nvarious machine learning approaches to tackle the task, leveraging data from\nERA5. This paper discusses the methodologies employed, the results obtained,\nand the lessons learned, offering insights into the future of climate-related\npredictive modeling.\n","authors":["Ding Ning","Varvara Vetrova","Karin R. Bryan","Yun Sing Koh","Andreas Voskou","N'Dah Jean Kouagou","Arnab Sharma"],"pdf_url":"https://arxiv.org/pdf/2501.05731v1.pdf","comment":"The paper contains 9 pages for the main text and 10 pages including\n References. 5 figures. 
Discovery Track, European Conference on Machine\n Learning and Principles and Practice of Knowledge Discovery in Databases\n (ECML PKDD) 2024"},{"id":"http://arxiv.org/abs/2501.05730v1","updated":"2025-01-10T05:54:04Z","published":"2025-01-10T05:54:04Z","title":"Element-wise Attention Is All You Need","summary":" The self-attention (SA) mechanism has demonstrated superior performance\nacross various domains, yet it suffers from substantial complexity during both\ntraining and inference. The next-generation architecture, aiming at retaining\nthe competitive performance of SA while achieving low-cost inference and\nefficient long-sequence training, primarily focuses on three approaches: linear\nattention, linear RNNs, and state space models. Although these approaches\nachieve reduced complexity than SA, they all have built-in performance\ndegradation factors, such as diminished “spikiness” and compression of\nhistorical information. In contrast to these approaches, we propose a novel\nelement-wise attention mechanism, which uses the element-wise squared Euclidean\ndistance, instead of the dot product operation, to compute similarity and\napproximates the quadratic complexity term $\\exp(q_{ic}k_{jc})$ with a Taylor\npolynomial. This design achieves remarkable efficiency: during training, the\nelement-wise attention has a complexity of $\\mathcal{O}(tLD)$, making\nlong-sequence training both computationally and memory efficient, where $L$ is\nthe sequence length, $D$ is the feature dimension, and $t$ is the highest order\nof the polynomial; during inference, it can be reformulated as recurrent neural\nnetworks, achieving a inference complexity of $\\mathcal{O}(tD)$. 
Furthermore,\nthe element-wise attention circumvents the performance degradation factors\npresent in these approaches and achieves performance comparable to SA in both\ncausal and non-causal forms.\n","authors":["Guoxin Feng"],"pdf_url":"https://arxiv.org/pdf/2501.05730v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05727v1","updated":"2025-01-10T05:51:52Z","published":"2025-01-10T05:51:52Z","title":"Enabling Scalable Oversight via Self-Evolving Critic","summary":" Despite their remarkable performance, the development of Large Language\nModels (LLMs) faces a critical challenge in scalable oversight: providing\neffective feedback for tasks where human evaluation is difficult or where LLMs\noutperform humans. While there is growing interest in using LLMs for critique,\ncurrent approaches still rely on human annotations or more powerful models,\nleaving the issue of enhancing critique capabilities without external\nsupervision unresolved. We introduce SCRIT (Self-evolving CRITic), a framework\nthat enables genuine self-evolution of critique abilities. Technically, SCRIT\nself-improves by training on synthetic data, generated by a contrastive-based\nself-critic that uses reference solutions for step-by-step critique, and a\nself-validation mechanism that ensures critique quality through correction\noutcomes. Implemented with Qwen2.5-72B-Instruct, one of the most powerful LLMs,\nSCRIT achieves up to a 10.3\\% improvement on critique-correction and error\nidentification benchmarks. 
Our analysis reveals that SCRIT's performance scales\npositively with data and model size, outperforms alternative approaches, and\nbenefits critically from its self-validation component.\n","authors":["Zhengyang Tang","Ziniu Li","Zhenyang Xiao","Tian Ding","Ruoyu Sun","Benyou Wang","Dayiheng Liu","Fei Huang","Tianyu Liu","Bowen Yu","Junyang Lin"],"pdf_url":"https://arxiv.org/pdf/2501.05727v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00778v3","updated":"2025-01-10T05:35:58Z","published":"2024-06-02T15:35:45Z","title":"Bayesian Joint Additive Factor Models for Multiview Learning","summary":" It is increasingly common in a wide variety of applied settings to collect\ndata of multiple different types on the same set of samples. Our particular\nfocus in this article is on studying relationships between such multiview\nfeatures and responses. A motivating application arises in the context of\nprecision medicine where multi-omics data are collected to correlate with\nclinical outcomes. It is of interest to infer dependence within and across\nviews while combining multimodal information to improve the prediction of\noutcomes. The signal-to-noise ratio can vary substantially across views,\nmotivating more nuanced statistical tools beyond standard late and early\nfusion. This challenge comes with the need to preserve interpretability, select\nfeatures, and obtain accurate uncertainty quantification. We propose a joint\nadditive factor regression model (JAFAR) with a structured additive design,\naccounting for shared and view-specific components. We ensure identifiability\nvia a novel dependent cumulative shrinkage process (D-CUSP) prior. We provide\nan efficient implementation via a partially collapsed Gibbs sampler and extend\nour approach to allow flexible feature and outcome distributions. Prediction of\ntime-to-labor onset from immunome, metabolome, and proteome data illustrates\nperformance gains against state-of-the-art competitors. 
Our open-source\nsoftware (R package) is available at https://github.com/niccoloanceschi/jafar.\n","authors":["Niccolo Anceschi","Federico Ferrari","David B. Dunson","Himel Mallick"],"pdf_url":"https://arxiv.org/pdf/2406.00778v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15720v2","updated":"2025-01-10T05:32:06Z","published":"2023-08-30T02:50:54Z","title":"Surrogate-based Autotuning for Randomized Sketching Algorithms in\n Regression Problems","summary":" Algorithms from Randomized Numerical Linear Algebra (RandNLA) are known to be\neffective in handling high-dimensional computational problems, providing\nhigh-quality empirical performance as well as strong probabilistic guarantees.\nHowever, their practical application is complicated by the fact that the user\nneeds to set various algorithm-specific tuning parameters which are different\nthan those used in traditional NLA. This paper demonstrates how a\nsurrogate-based autotuning approach can be used to address fundamental problems\nof parameter selection in RandNLA algorithms. In particular, we provide a\ndetailed investigation of surrogate-based autotuning for\nsketch-and-precondition (SAP) based randomized least squares methods, which\nhave been one of the great success stories in modern RandNLA. Empirical results\nshow that our surrogate-based autotuning approach can achieve near-optimal\nperformance with much less tuning cost than a random search (up to about 4x\nfewer trials of different parameter configurations). Moreover, while our\nexperiments focus on least squares, our results demonstrate a general-purpose\nautotuning pipeline applicable to any kind of RandNLA algorithm.\n","authors":["Younghyun Cho","James W. Demmel","Michał Dereziński","Haoyun Li","Hengrui Luo","Michael W. Mahoney","Riley J. Murray"],"pdf_url":"https://arxiv.org/pdf/2308.15720v2.pdf","comment":"Improved the presentation and clarity. Updated experimental results\n and scenarios. 
Accepted for publication in SIAM Journal on Matrix Analysis\n and Applications"},{"id":"http://arxiv.org/abs/2212.12322v3","updated":"2025-01-10T05:28:08Z","published":"2022-12-22T08:33:32Z","title":"Infrared Image Super-Resolution: Systematic Review, and Future Trends","summary":" Image Super-Resolution (SR) is essential for a wide range of computer vision\nand image processing tasks. Investigating infrared (IR) image (or thermal\nimages) super-resolution is a continuing concern within the development of deep\nlearning. This survey aims to provide a comprehensive perspective of IR image\nsuper-resolution, including its applications, hardware imaging system dilemmas,\nand taxonomy of image processing methodologies. In addition, the datasets and\nevaluation metrics in IR image super-resolution tasks are also discussed.\nFurthermore, the deficiencies in current technologies and possible promising\ndirections for the community to explore are highlighted. To cope with the rapid\ndevelopment in this field, we intend to regularly update the relevant excellent\nwork at \\url{https://github.com/yongsongH/Infrared_Image_SR_Survey}\n","authors":["Yongsong Huang","Tomo Miyazaki","Xiaofeng Liu","Shinichiro Omachi"],"pdf_url":"https://arxiv.org/pdf/2212.12322v3.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2501.05707v1","updated":"2025-01-10T04:35:46Z","published":"2025-01-10T04:35:46Z","title":"Multiagent Finetuning: Self Improvement with Diverse Reasoning Chains","summary":" Large language models (LLMs) have achieved remarkable performance in recent\nyears but are fundamentally limited by the underlying training data. To improve\nmodels beyond the training data, recent works have explored how LLMs can be\nused to generate synthetic data for autonomous self-improvement. 
However,\nsuccessive steps of self-improvement can reach a point of diminishing returns.\nIn this work, we propose a complementary approach towards self-improvement\nwhere finetuning is applied to a multiagent society of language models. A group\nof language models, all starting from the same base model, are independently\nspecialized by updating each one using data generated through multiagent\ninteractions among the models. By training each model on independent sets of\ndata, we illustrate how this approach enables specialization across models and\ndiversification over the set of models. As a result, our overall system is able\nto preserve diverse reasoning chains and autonomously improve over many more\nrounds of fine-tuning than single-agent self-improvement methods. We\nquantitatively illustrate the efficacy of the approach across a wide suite of\nreasoning tasks.\n","authors":["Vighnesh Subramaniam","Yilun Du","Joshua B. Tenenbaum","Antonio Torralba","Shuang Li","Igor Mordatch"],"pdf_url":"https://arxiv.org/pdf/2501.05707v1.pdf","comment":"22 pages, 13 figures, 7 tables; Project page at\n https://llm-multiagent-ft.github.io/"},{"id":"http://arxiv.org/abs/2411.12924v2","updated":"2025-01-10T03:55:57Z","published":"2024-11-19T23:22:33Z","title":"Human-In-the-Loop Software Development Agents","summary":" Recently, Large Language Models (LLMs)-based multi-agent paradigms for\nsoftware engineering are introduced to automatically resolve software\ndevelopment tasks (e.g., from a given issue to source code). However, existing\nwork is evaluated based on historical benchmark datasets, rarely considers\nhuman feedback at each stage of the automated software development process, and\nhas not been deployed in practice. In this paper, we introduce a\nHuman-in-the-loop LLM-based Agents framework (HULA) for software development\nthat allows software engineers to refine and guide LLMs when generating coding\nplans and source code for a given task. 
We design, implement, and deploy the\nHULA framework into Atlassian JIRA for internal uses. Through a multi-stage\nevaluation of the HULA framework, Atlassian software engineers perceive that\nHULA can minimize the overall development time and effort, especially in\ninitiating a coding plan and writing code for straightforward tasks. On the\nother hand, challenges around code quality remain a concern in some cases. We\ndraw lessons learned and discuss opportunities for future work, which will pave\nthe way for the advancement of LLM-based agents in software development.\n","authors":["Wannita Takerngsaksiri","Jirat Pasuksmit","Patanamon Thongtanunam","Chakkrit Tantithamthavorn","Ruixiong Zhang","Fan Jiang","Jing Li","Evan Cook","Kun Chen","Ming Wu"],"pdf_url":"https://arxiv.org/pdf/2411.12924v2.pdf","comment":"10 pages, 9 figures, ICSE SEIP 2025"},{"id":"http://arxiv.org/abs/2501.04608v2","updated":"2025-01-10T03:08:11Z","published":"2025-01-08T16:44:06Z","title":"Comprehensive Examination of Unrolled Networks for Solving Linear\n Inverse Problems","summary":" Unrolled networks have become prevalent in various computer vision and\nimaging tasks. Although they have demonstrated remarkable efficacy in solving\nspecific computer vision and computational imaging tasks, their adaptation to\nother applications presents considerable challenges. This is primarily due to\nthe multitude of design decisions that practitioners working on new\napplications must navigate, each potentially affecting the network's overall\nperformance. These decisions include selecting the optimization algorithm,\ndefining the loss function, and determining the number of convolutional layers,\namong others. Compounding the issue, evaluating each design choice requires\ntime-consuming simulations to train, fine-tune the neural network, and optimize\nfor its performance. 
As a result, the process of exploring multiple options and\nidentifying the optimal configuration becomes time-consuming and\ncomputationally demanding. The main objectives of this paper are (1) to unify\nsome ideas and methodologies used in unrolled networks to reduce the number of\ndesign choices a user has to make, and (2) to report a comprehensive ablation\nstudy to discuss the impact of each of the choices involved in designing\nunrolled networks and present practical recommendations based on our findings.\nWe anticipate that this study will help scientists and engineers design\nunrolled networks for their applications and diagnose problems within their\nnetworks efficiently.\n","authors":["Eric Chen","Xi Chen","Arian Maleki","Shirin Jalali"],"pdf_url":"https://arxiv.org/pdf/2501.04608v2.pdf","comment":"27 pages, 10 figures. Project Page:\n https://github.com/YuxiChen25/Memory-Net-Inverse"},{"id":"http://arxiv.org/abs/2501.05680v1","updated":"2025-01-10T03:07:28Z","published":"2025-01-10T03:07:28Z","title":"EXION: Exploiting Inter- and Intra-Iteration Output Sparsity for\n Diffusion Models","summary":" Over the past few years, diffusion models have emerged as novel AI solutions,\ngenerating diverse multi-modal outputs from text prompts. Despite their\ncapabilities, they face challenges in computing, such as excessive latency and\nenergy consumption due to their iterative architecture. Although prior works\nspecialized in transformer acceleration can be applied, the iterative nature of\ndiffusion models remains unresolved. In this paper, we present EXION, the first\nSW-HW co-designed diffusion accelerator that solves the computation challenges\nby exploiting the unique inter- and intra-iteration output sparsity in\ndiffusion models. To this end, we propose two SW-level optimizations. First, we\nintroduce the FFN-Reuse algorithm that identifies and skips redundant\ncomputations in FFN layers across different iterations (inter-iteration\nsparsity). 
Second, we use a modified eager prediction method that employs\ntwo-step leading-one detection to accurately predict the attention score,\nskipping unnecessary computations within an iteration (intra-iteration\nsparsity). We also introduce a novel data compaction mechanism named ConMerge,\nwhich can enhance HW utilization by condensing and merging sparse matrices into\ncompact forms. Finally, it has a dedicated HW architecture that supports the\nabove sparsity-inducing algorithms, translating high output sparsity into\nimproved energy efficiency and performance. To verify the feasibility of the\nEXION, we first demonstrate that it has no impact on accuracy in various types\nof multi-modal diffusion models. We then instantiate EXION in both server- and\nedge-level settings and compare its performance against GPUs with similar\nspecifications. Our evaluation shows that EXION achieves dramatic improvements\nin performance and energy efficiency by 3.2-379.3x and 45.1-3067.6x compared to\na server GPU and by 42.6-1090.9x and 196.9-4668.2x compared to an edge GPU.\n","authors":["Jaehoon Heo","Adiwena Putra","Jieon Yoon","Sungwoong Yune","Hangyeol Lee","Ji-Hoon Kim","Joo-Young Kim"],"pdf_url":"https://arxiv.org/pdf/2501.05680v1.pdf","comment":"To appear in 2025 IEEE International Symposium on High-Performance\n Computer Architecture (HPCA 2025)"},{"id":"http://arxiv.org/abs/2106.02329v3","updated":"2025-01-10T03:00:30Z","published":"2021-06-04T08:25:47Z","title":"Deep Switching State Space Model (DS$^3$M) for Nonlinear Time Series\n Forecasting with Regime Switching","summary":" Modern time series data often display complex nonlinear dependencies along\nwith irregular regime-switching behaviors. These features present technical\nchallenges in modeling, inference, and in offering insightful understanding\ninto the underlying stochastic phenomena. To tackle these challenges, we\nintroduce a novel modeling framework known as the Deep Switching State Space\nModel (DS$^3$M). 
This framework is engineered to make accurate forecasts for\nsuch time series while adeptly identifying the irregular regimes hidden within\nthe dynamics. These identifications not only have significant economic\nramifications but also contribute to a deeper understanding of the underlying\nphenomena. In DS$^3$M, the architecture employs discrete latent variables to\nrepresent regimes and continuous latent variables to account for random driving\nfactors. By melding a Recurrent Neural Network (RNN) with a nonlinear Switching\nState Space Model (SSSM), we manage to capture the nonlinear dependencies and\nirregular regime-switching behaviors, governed by a Markov chain and\nparameterized using multilayer perceptrons. We validate the effectiveness and\nregime identification capabilities of DS$^3$M through short- and long-term\nforecasting tests on a wide array of simulated and real-world datasets,\nspanning sectors such as healthcare, economics, traffic, meteorology, and\nenergy. Experimental results reveal that DS$^3$M outperforms several\nstate-of-the-art models in terms of forecasting accuracy, while providing\nmeaningful regime identifications.\n","authors":["Xiuqin Xu","Hanqiu Peng","Ying Chen"],"pdf_url":"https://arxiv.org/pdf/2106.02329v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05675v1","updated":"2025-01-10T02:57:08Z","published":"2025-01-10T02:57:08Z","title":"Facilitate Collaboration between Large Language Model and Task-specific\n Model for Time Series Anomaly Detection","summary":" In anomaly detection, methods based on large language models (LLMs) can\nincorporate expert knowledge, while task-specific smaller models excel at\nextracting normal patterns and detecting value fluctuations. 
Inspired by the\nhuman nervous system, where the brain stores expert knowledge and the\nperipheral nervous system and spinal cord handle specific tasks like withdrawal\nand knee-jerk reflexes, we propose CoLLaTe, a framework designed to facilitate\ncollaboration between LLMs and task-specific models, leveraging the strengths\nof both.\n In this work, we first formulate the collaboration process and identify two\nkey challenges in the collaboration between LLMs and task-specific models: (1)\nthe misalignment between the expression domains of LLMs and smaller models, and\n(2) error accumulation arising from the predictions of both models.\n To address these challenges, we introduce two key components in CoLLaTe: the\nalignment module and the collaborative loss function. Through theoretical\nanalysis and experimental validation, we demonstrate that these components\neffectively mitigate the identified challenges and achieve better performance\nthan LLM-based methods and task-specific smaller models.\n","authors":["Feiyi Chen","Leilei Zhang","Guansong Pang","Roger Zimmermann","Shuiguang Deng"],"pdf_url":"https://arxiv.org/pdf/2501.05675v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04967v2","updated":"2025-01-10T02:54:18Z","published":"2025-01-09T04:41:50Z","title":"Targeted Adversarial Denoising Autoencoders (TADA) for Neural Time\n Series Filtration","summary":" Current machine learning (ML)-based algorithms for filtering\nelectroencephalography (EEG) time series data face challenges related to\ncumbersome training times, regularization, and accurate reconstruction. To\naddress these shortcomings, we present an ML filtration algorithm driven by a\nlogistic covariance-targeted adversarial denoising autoencoder (TADA). We\nhypothesize that the expressivity of a targeted, correlation-driven\nconvolutional autoencoder will enable effective time series filtration while\nminimizing compute requirements (e.g., runtime, model size). 
Furthermore, we\nexpect that adversarial training with covariance rescaling will minimize signal\ndegradation. To test this hypothesis, a TADA system prototype was trained and\nevaluated on the task of removing electromyographic (EMG) noise from EEG data\nin the EEGdenoiseNet dataset, which includes EMG and EEG data from 67 subjects.\nThe TADA filter surpasses conventional signal filtration algorithms across\nquantitative metrics (Correlation Coefficient, Temporal RRMSE, Spectral RRMSE),\nand performs competitively against other deep learning architectures at a\nreduced model size of less than 400,000 trainable parameters. Further\nexperimentation will be necessary to assess the viability of TADA on a wider\nrange of deployment cases.\n","authors":["Benjamin J. Choi","Griffin Milsap","Clara A. Scholl","Francesco Tenore","Mattson Ogg"],"pdf_url":"https://arxiv.org/pdf/2501.04967v2.pdf","comment":"[Accepted] Artificial Intelligence for Time Series Analysis (AI4TS):\n Theory, Algorithms, and Applications @ AAAI 2025, Philadelphia, PA, USA"},{"id":"http://arxiv.org/abs/2501.05667v1","updated":"2025-01-10T02:33:15Z","published":"2025-01-10T02:33:15Z","title":"TransPlace: Transferable Circuit Global Placement via Graph Neural\n Network","summary":" Global placement, a critical step in designing the physical layout of\ncomputer chips, is essential to optimize chip performance. Prior global\nplacement methods optimize each circuit design individually from scratch. Their\nneglect of transferable knowledge limits solution efficiency and chip\nperformance as circuit complexity drastically increases. This study presents\nTransPlace, a global placement framework that learns to place millions of\nmixed-size cells in continuous space. 
TransPlace introduces i) Netlist Graph to\nefficiently model netlist topology, ii) Cell-flow and relative position\nencoding to learn SE(2)-invariant representation, iii) a tailored graph neural\nnetwork architecture for informed parameterization of placement knowledge, and\niv) a two-stage strategy for coarse-to-fine placement. Compared to\nstate-of-the-art placement methods, TransPlace, trained on a few high-quality\nplacements, can place unseen circuits with 1.2x speedup while reducing\ncongestion by 30%, timing by 9%, and wirelength by 5%.\n","authors":["Yunbo Hou","Haoran Ye","Yingxue Zhang","Siyuan Xu","Guojie Song"],"pdf_url":"https://arxiv.org/pdf/2501.05667v1.pdf","comment":"Accepted at KDD 2025"},{"id":"http://arxiv.org/abs/2501.05663v1","updated":"2025-01-10T02:28:19Z","published":"2025-01-10T02:28:19Z","title":"Learning to Measure Quantum Neural Networks","summary":" The rapid progress in quantum computing (QC) and machine learning (ML) has\nattracted growing attention, prompting extensive research into quantum machine\nlearning (QML) algorithms to solve diverse and complex problems. Designing\nhigh-performance QML models demands expert-level proficiency, which remains a\nsignificant obstacle to the broader adoption of QML. A few major hurdles\ninclude crafting effective data encoding techniques and parameterized quantum\ncircuits, both of which are crucial to the performance of QML models.\nAdditionally, the measurement phase is frequently overlooked: most current QML\nmodels rely on pre-defined measurement protocols that often fail to account for\nthe specific problem being addressed. We introduce a novel approach that makes\nthe observable of the quantum system (specifically, the Hermitian\nmatrix) learnable. Our method features an end-to-end differentiable learning\nframework, where the parameterized observable is trained alongside the ordinary\nquantum circuit parameters simultaneously. 
Using numerical simulations, we show\nthat the proposed method can identify observables for variational quantum\ncircuits that lead to improved outcomes, such as higher classification\naccuracy, thereby boosting the overall performance of QML models.\n","authors":["Samuel Yen-Chi Chen","Huan-Hsin Tseng","Hsin-Yi Lin","Shinjae Yoo"],"pdf_url":"https://arxiv.org/pdf/2501.05663v1.pdf","comment":"Accepted by ICASSP 2025 Workshop: Quantum Machine Learning in Signal\n Processing and Artificial Intelligence"},{"id":"http://arxiv.org/abs/2501.05661v1","updated":"2025-01-10T02:25:39Z","published":"2025-01-10T02:25:39Z","title":"TAMER: A Test-Time Adaptive MoE-Driven Framework for EHR Representation\n Learning","summary":" We propose TAMER, a Test-time Adaptive MoE-driven framework for EHR\nRepresentation learning. TAMER combines a Mixture-of-Experts (MoE) with\nTest-Time Adaptation (TTA) to address two critical challenges in EHR modeling:\npatient population heterogeneity and distribution shifts. The MoE component\nhandles diverse patient subgroups, while TTA enables real-time adaptation to\nevolving health status distributions when new patient samples are introduced.\nExtensive experiments across four real-world EHR datasets demonstrate that\nTAMER consistently improves predictive performance for both mortality and\nreadmission risk tasks when combined with diverse EHR modeling backbones. TAMER\noffers a promising approach for dynamic and personalized EHR-based predictions\nin practical clinical settings. 
Code is publicly available at\nhttps://github.com/yhzhu99/TAMER.\n","authors":["Yinghao Zhu","Xiaochen Zheng","Ahmed Allam","Michael Krauthammer"],"pdf_url":"https://arxiv.org/pdf/2501.05661v1.pdf","comment":"8 pages, 3 figures, 7 tables"},{"id":"http://arxiv.org/abs/2501.05656v1","updated":"2025-01-10T02:14:29Z","published":"2025-01-10T02:14:29Z","title":"Evidential Deep Learning for Uncertainty Quantification and\n Out-of-Distribution Detection in Jet Identification using Deep Neural\n Networks","summary":" Current methods commonly used for uncertainty quantification (UQ) in deep\nlearning (DL) models utilize Bayesian methods which are computationally\nexpensive and time-consuming. In this paper, we provide a detailed study of UQ\nbased on evidential deep learning (EDL) for deep neural network models designed\nto identify jets in high energy proton-proton collisions at the Large Hadron\nCollider and explore its utility in anomaly detection. EDL is a DL approach\nthat treats learning as an evidence acquisition process designed to provide\nconfidence (or epistemic uncertainty) about test data. Using publicly available\ndatasets for jet classification benchmarking, we explore hyperparameter\noptimizations for EDL applied to the challenge of UQ for jet identification. We\nalso investigate how the uncertainty is distributed for each jet class, how\nthis method can be implemented for the detection of anomalies, how the\nuncertainty compares with Bayesian ensemble methods, and how the uncertainty\nmaps onto latent spaces for the models. Our studies uncover some pitfalls of\nEDL applied to anomaly detection and a more effective way to quantify\nuncertainty from EDL as compared with the foundational EDL setup. 
These studies\nillustrate a methodological approach to interpreting EDL in jet classification\nmodels, providing new insights on how EDL quantifies uncertainty and detects\nout-of-distribution data which may lead to improved EDL methods for DL models\napplied to classification tasks.\n","authors":["Ayush Khot","Xiwei Wang","Avik Roy","Volodymyr Kindratenko","Mark S. Neubauer"],"pdf_url":"https://arxiv.org/pdf/2501.05656v1.pdf","comment":"38 pages (including references) with 17 figures and 3 tables.\n Repository: https://github.com/FAIR4HEP/PFIN4UQAD . Submitted to Machine\n Learning: Science and Technology"},{"id":"http://arxiv.org/abs/2404.11917v2","updated":"2025-01-10T02:08:52Z","published":"2024-04-18T05:48:15Z","title":"Expected Coordinate Improvement for High-Dimensional Bayesian\n Optimization","summary":" Bayesian optimization (BO) algorithm is very popular for solving\nlow-dimensional expensive optimization problems. Extending Bayesian\noptimization to high dimension is a meaningful but challenging task. One of the\nmajor challenges is that it is difficult to find good infill solutions as the\nacquisition functions are also high-dimensional. In this work, we propose the\nexpected coordinate improvement (ECI) criterion for high-dimensional Bayesian\noptimization. The proposed ECI criterion measures the potential improvement we\ncan get by moving the current best solution along one coordinate. The proposed\napproach selects the coordinate with the highest ECI value to refine in each\niteration and covers all the coordinates gradually by iterating over the\ncoordinates. The greatest advantage of the proposed ECI-BO (expected coordinate\nimprovement based Bayesian optimization) algorithm over the standard BO\nalgorithm is that the infill selection problem of the proposed algorithm is\nalways a one-dimensional problem thus can be easily solved. 
Numerical\nexperiments show that the proposed algorithm can achieve significantly better\nresults than the standard BO algorithm and competitive results when compared\nwith five state-of-the-art high-dimensional BOs. This work provides a simple\nbut efficient approach for high-dimensional Bayesian optimization.\n","authors":["Dawei Zhan"],"pdf_url":"https://arxiv.org/pdf/2404.11917v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05651v1","updated":"2025-01-10T01:42:05Z","published":"2025-01-10T01:42:05Z","title":"A Practical Cross-Layer Approach for ML-Driven Storage Placement in\n Warehouse-Scale Computers","summary":" Storage systems account for a major portion of the total cost of ownership\n(TCO) of warehouse-scale computers, and thus have a major impact on the overall\nsystem's efficiency. Machine learning (ML)-based methods for solving key\nproblems in storage system efficiency, such as data placement, have shown\nsignificant promise. However, there are few known practical deployments of such\nmethods. Studying this problem in the context of real-world hyperscale data\ncenter deployments at Google, we identify a number of challenges that we\nbelieve cause this lack of practical adoption. Specifically, prior work assumes\na monolithic model that resides entirely within the storage layer, an\nunrealistic assumption in real-world data center deployments. We propose a\ncross-layer approach that moves ML out of the storage system and performs it in\nthe application running on top of it, co-designed with a scheduling algorithm\nat the storage layer that consumes predictions from these application-level\nmodels. This approach combines small, interpretable models with a co-designed\nheuristic that adapts to different online environments. We build a\nproof-of-concept of this approach in a production distributed computation\nframework at Google. 
Evaluations in a test deployment and large-scale\nsimulation studies using production traces show improvements of as much as\n3.47x in TCO savings compared to state-of-the-art baselines. We believe this\nwork represents a significant step towards more practical ML-driven storage\nplacement in warehouse-scale computers.\n","authors":["Chenxi Yang","Yan Li","Martin Maas","Mustafa Uysal","Ubaid Ullah Hafeez","Arif Merchant","Richard McDougall"],"pdf_url":"https://arxiv.org/pdf/2501.05651v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05646v1","updated":"2025-01-10T01:25:01Z","published":"2025-01-10T01:25:01Z","title":"Efficient Representations for High-Cardinality Categorical Variables in\n Machine Learning","summary":" High-cardinality categorical variables pose significant challenges in\nmachine learning, particularly in terms of computational efficiency and model\ninterpretability. Traditional one-hot encoding often results in\nhigh-dimensional sparse feature spaces, increasing the risk of overfitting and\nreducing scalability. This paper introduces novel encoding techniques,\nincluding means encoding, low-rank encoding, and multinomial logistic\nregression encoding, to address these challenges. These methods leverage\nsufficient representations to generate compact and informative embeddings of\ncategorical data. 
We conduct rigorous theoretical analyses and empirical\nvalidations on diverse datasets, demonstrating significant improvements in\nmodel performance and computational efficiency compared to baseline methods.\nThe proposed techniques are particularly effective in domains requiring\nscalable solutions for large datasets, paving the way for more robust and\nefficient applications in machine learning.\n","authors":["Zixuan Liang"],"pdf_url":"https://arxiv.org/pdf/2501.05646v1.pdf","comment":"2025 International Conference on Advanced Machine Learning and Data\n Science (AMLDS 2025)"},{"id":"http://arxiv.org/abs/2412.20006v2","updated":"2025-01-10T01:09:37Z","published":"2024-12-28T04:06:29Z","title":"Adversarial Robustness for Deep Learning-based Wildfire Prediction\n Models","summary":" Smoke detection using Deep Neural Networks (DNNs) is an effective approach\nfor early wildfire detection. However, because smoke is temporally and\nspatially anomalous, there are limitations in collecting sufficient training\ndata. This raises overfitting and bias concerns in existing DNN-based wildfire\ndetection models. Thus, we introduce WARP (Wildfire Adversarial Robustness\nProcedure), the first model-agnostic framework for evaluating the adversarial\nrobustness of DNN-based wildfire detection models. WARP addresses limitations\nin smoke image diversity using global and local adversarial attack methods. The\nglobal attack method uses image-contextualized Gaussian noise, while the local\nattack method uses patch noise injection, tailored to address critical aspects\nof wildfire detection. Leveraging WARP's model-agnostic capabilities, we assess\nthe adversarial robustness of real-time Convolutional Neural Networks (CNNs)\nand Transformers. The analysis revealed valuable insights into the models'\nlimitations. Specifically, the global attack method demonstrates that the\nTransformer model has more than 70% precision degradation than the CNN against\nglobal noise. 
In contrast, the local attack method shows that both models are\nsusceptible to cloud image injections when detecting smoke-positive instances,\nsuggesting a need for model improvements through data augmentation. WARP's\ncomprehensive robustness analysis contributed to the development of\nwildfire-specific data augmentation strategies, marking a step toward\npracticality.\n","authors":["Ryo Ide","Lei Yang"],"pdf_url":"https://arxiv.org/pdf/2412.20006v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.18544v2","updated":"2025-01-10T01:06:06Z","published":"2024-12-24T16:51:35Z","title":"Consistency Checks for Language Model Forecasters","summary":" Forecasting is a task that is difficult to evaluate: the ground truth can\nonly be known in the future. Recent work showing LLM forecasters rapidly\napproaching human-level performance begs the question: how can we benchmark and\nevaluate these forecasters instantaneously? Following the consistency check\nframework, we measure the performance of forecasters in terms of the\nconsistency of their predictions on different logically-related questions. We\npropose a new, general consistency metric based on arbitrage: for example, if a\nforecasting AI illogically predicts that both the Democratic and Republican\nparties have 60% probability of winning the 2024 US presidential election, an\narbitrageur can trade against the forecaster's predictions and make a profit.\nWe build an automated evaluation system that generates a set of base questions,\ninstantiates consistency checks from these questions, elicits the predictions\nof the forecaster, and measures the consistency of the predictions. We then\nbuild a standard, proper-scoring-rule forecasting benchmark, and show that our\n(instantaneous) consistency metrics correlate with LLM forecasters' ground\ntruth Brier scores (which are only known in the future). 
We also release a\nconsistency benchmark that resolves in 2028, providing a long-term evaluation\ntool for forecasting.\n","authors":["Daniel Paleka","Abhimanyu Pallavi Sudhir","Alejandro Alvarez","Vineeth Bhat","Adam Shen","Evan Wang","Florian Tramèr"],"pdf_url":"https://arxiv.org/pdf/2412.18544v2.pdf","comment":"55 pages, 25 figures. Submitted to ICLR 2025"},{"id":"http://arxiv.org/abs/2501.05644v1","updated":"2025-01-10T01:02:43Z","published":"2025-01-10T01:02:43Z","title":"Interpretable Enzyme Function Prediction via Residue-Level Detection","summary":" Predicting multiple functions labeled with Enzyme Commission (EC) numbers\nfrom the enzyme sequence is of great significance but remains a challenge due\nto its sparse multi-label classification nature, i.e., each enzyme is typically\nassociated with only a few labels out of more than 6000 possible EC numbers.\nHowever, existing machine learning algorithms generally learn a fixed global\nrepresentation for each enzyme to classify all functions, thereby they lack\ninterpretability and the fine-grained information of some function-specific\nlocal residue fragments may be overwhelmed. Here we present an attention-based\nframework, namely ProtDETR (Protein Detection Transformer), by casting enzyme\nfunction prediction as a detection problem. 
It uses a set of learnable\nfunctional queries to adaptively extract different local representations from\nthe sequence of residue-level features for predicting different EC numbers.\nProtDETR not only significantly outperforms existing deep learning-based enzyme\nfunction prediction methods, but also provides a new interpretable perspective\non automatically detecting different local regions for identifying different\nfunctions through cross-attentions between queries and residue-level features.\nCode is available at https://github.com/yangzhao1230/ProtDETR.\n","authors":["Zhao Yang","Bing Su","Jiahao Chen","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2501.05644v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.05488v2","updated":"2025-01-10T00:58:28Z","published":"2024-12-07T01:19:14Z","title":"Enhancing Sample Generation of Diffusion Models using Noise Level\n Correction","summary":" The denoising process of diffusion models can be interpreted as an\napproximate projection of noisy samples onto the data manifold. Moreover, the\nnoise level in these samples approximates their distance to the underlying\nmanifold. Building on this insight, we propose a novel method to enhance sample\ngeneration by aligning the estimated noise level with the true distance of\nnoisy samples to the manifold. Specifically, we introduce a noise level\ncorrection network, leveraging a pre-trained denoising network, to refine noise\nlevel estimates during the denoising process. Additionally, we extend this\napproach to various image restoration tasks by integrating task-specific\nconstraints, including inpainting, deblurring, super-resolution, colorization,\nand compressed sensing. Experimental results demonstrate that our method\nsignificantly improves sample quality in both unconstrained and constrained\ngeneration scenarios. 
Notably, the proposed noise level correction framework is\ncompatible with existing denoising schedulers (e.g., DDIM), offering additional\nperformance improvements.\n","authors":["Abulikemu Abuduweili","Chenyang Yuan","Changliu Liu","Frank Permenter"],"pdf_url":"https://arxiv.org/pdf/2412.05488v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05635v1","updated":"2025-01-10T00:42:27Z","published":"2025-01-10T00:42:27Z","title":"Enhancing Unsupervised Graph Few-shot Learning via Set Functions and\n Optimal Transport","summary":" Graph few-shot learning has garnered significant attention for its ability to\nrapidly adapt to downstream tasks with limited labeled data, sparking\nconsiderable interest among researchers. Recent advancements in graph few-shot\nlearning models have exhibited superior performance across diverse\napplications. Despite their successes, several limitations still exist. First,\nexisting models in the meta-training phase predominantly focus on\ninstance-level features within tasks, neglecting crucial set-level features\nessential for distinguishing between different categories. Second, these models\noften utilize query sets directly on classifiers trained with support sets\ncontaining only a few labeled examples, overlooking potential distribution\nshifts between these sets and leading to suboptimal performance. Finally,\nprevious models typically require abundant labeled data from base\nclasses to extract transferable knowledge, which is typically infeasible in\nreal-world scenarios. To address these issues, we propose a novel model named\nSTAR, which leverages Set funcTions and optimAl tRansport for enhancing\nunsupervised graph few-shot learning. Specifically, STAR utilizes expressive\nset functions to obtain set-level features in an unsupervised manner and\nemploys optimal transport principles to align the distributions of support and\nquery sets, thereby mitigating distribution shift effects. 
Theoretical analysis\ndemonstrates that STAR can capture more task-relevant information and enhance\ngeneralization capabilities. Empirically, extensive experiments across multiple\ndatasets validate the effectiveness of STAR. Our code can be found here.\n","authors":["Yonghao Liu","Fausto Giunchiglia","Ximing Li","Lan Huang","Xiaoyue Feng","Renchu Guan"],"pdf_url":"https://arxiv.org/pdf/2501.05635v1.pdf","comment":"KDD2025"},{"id":"http://arxiv.org/abs/2501.05633v1","updated":"2025-01-10T00:32:46Z","published":"2025-01-10T00:32:46Z","title":"Regularized Top-$k$: A Bayesian Framework for Gradient Sparsification","summary":" Error accumulation is effective for gradient sparsification in distributed\nsettings: initially-unselected gradient entries are eventually selected as\ntheir accumulated error exceeds a certain level. The accumulation essentially\nbehaves as a scaling of the learning rate for the selected entries. Although\nthis property prevents the slow-down of lateral movements in distributed\ngradient descent, it can deteriorate convergence in some settings. This work\nproposes a novel sparsification scheme that controls the learning rate scaling\nof error accumulation. The development of this scheme follows two major steps:\nfirst, gradient sparsification is formulated as an inverse probability\n(inference) problem, and the Bayesian optimal sparsification mask is derived as\na maximum-a-posteriori estimator. Using the prior distribution inherited from\nTop-$k$, we derive a new sparsification algorithm which can be interpreted as a\nregularized form of Top-$k$. We call this algorithm regularized Top-$k$\n(RegTop-$k$). It utilizes past aggregated gradients to evaluate posterior\nstatistics of the next aggregation. It then prioritizes the local accumulated\ngradient entries based on these posterior statistics. We validate our\nderivation through numerical experiments. 
In distributed linear regression, it\nis observed that while Top-$k$ remains at a fixed distance from the global\noptimum, RegTop-$k$ converges to the global optimum at significantly higher\ncompression ratios. We further demonstrate the generalization of this\nobservation by employing RegTop-$k$ in distributed training of ResNet-18 on\nCIFAR-10, where it noticeably outperforms Top-$k$.\n","authors":["Ali Bereyhi","Ben Liang","Gary Boudreau","Ali Afana"],"pdf_url":"https://arxiv.org/pdf/2501.05633v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08294v3","updated":"2025-01-10T00:19:23Z","published":"2024-08-15T17:49:24Z","title":"eGAD! double descent is explained by Generalized Aliasing Decomposition","summary":" A central problem in data science is to use potentially noisy samples of an\nunknown function to predict values for unseen inputs. In classical statistics,\npredictive error is understood as a trade-off between the bias and the variance\nthat balances model simplicity with its ability to fit complex functions.\nHowever, over-parameterized models exhibit counterintuitive behaviors, such as\n\"double descent\" in which models of increasing complexity exhibit decreasing\ngeneralization error. Others may exhibit more complicated patterns of\npredictive error with multiple peaks and valleys. Neither double descent nor\nmultiple descent phenomena are well explained by the bias-variance\ndecomposition.\n We introduce a novel decomposition that we call the generalized aliasing\ndecomposition (GAD) to explain the relationship between predictive performance\nand model complexity. 
The GAD decomposes the predictive error into three parts:\n1) model insufficiency, which dominates when the number of parameters is much\nsmaller than the number of data points, 2) data insufficiency, which dominates\nwhen the number of parameters is much greater than the number of data points,\nand 3) generalized aliasing, which dominates between these two extremes.\n We demonstrate the applicability of the GAD to diverse applications,\nincluding random feature models from machine learning, Fourier transforms from\nsignal processing, solution methods for differential equations, and predictive\nformation enthalpy in materials discovery. Because key components of the GAD\ncan be explicitly calculated from the relationship between model class and\nsamples without seeing any data labels, it can answer questions related to\nexperimental design and model selection before collecting data or performing\nexperiments. We further demonstrate this approach on several examples and\ndiscuss implications for predictive modeling and data science.\n","authors":["Mark K. Transtrum","Gus L. W. Hart","Tyler J. Jarvis","Jared P. Whitehead"],"pdf_url":"https://arxiv.org/pdf/2408.08294v3.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2501.05686v1","updated":"2025-01-10T03:35:22Z","published":"2025-01-10T03:35:22Z","title":"Deep Reversible Consistency Learning for Cross-modal Retrieval","summary":" Cross-modal retrieval (CMR) typically involves learning common\nrepresentations to directly measure similarities between multimodal samples.\nMost existing CMR methods commonly assume multimodal samples in pairs and\nemploy joint training to learn common representations, limiting the flexibility\nof CMR. 
Although some methods adopt independent training strategies for each\nmodality to improve flexibility in CMR, they utilize the randomly initialized\northogonal matrices to guide representation learning, which is suboptimal since\nthey assume inter-class samples are independent of each other, limiting the\npotential of semantic alignments between sample representations and\nground-truth labels. To address these issues, we propose a novel method termed\nDeep Reversible Consistency Learning (DRCL) for cross-modal retrieval. DRCL\nincludes two core modules, \\ie Selective Prior Learning (SPL) and Reversible\nSemantic Consistency learning (RSC). More specifically, SPL first learns a\ntransformation weight matrix on each modality and selects the best one based on\nthe quality score as the Prior, which greatly avoids blind selection of priors\nlearned from low-quality modalities. Then, RSC employs a Modality-invariant\nRepresentation Recasting mechanism (MRR) to recast the potential\nmodality-invariant representations from sample semantic labels by the\ngeneralized inverse matrix of the prior. Since labels are devoid of\nmodal-specific information, we utilize the recast features to guide the\nrepresentation learning, thus maintaining semantic consistency to the fullest\nextent possible. In addition, a feature augmentation mechanism (FA) is\nintroduced in RSC to encourage the model to learn over a wider data\ndistribution for diversity. 
Finally, extensive experiments conducted on five\nwidely used datasets and comparisons with 15 state-of-the-art baselines\ndemonstrate the effectiveness and superiority of our DRCL.\n","authors":["Ruitao Pu","Yang Qin","Dezhong Peng","Xiaomin Song","Huiming Zheng"],"pdf_url":"https://arxiv.org/pdf/2501.05686v1.pdf","comment":null}],"Artificial Intelligence":[{"id":"http://arxiv.org/abs/2501.06164v1","updated":"2025-01-10T18:39:29Z","published":"2025-01-10T18:39:29Z","title":"Model Alignment Search","summary":" When can we say that two neural systems are the same? The answer to this\nquestion is goal-dependent, and it is often addressed through correlative\nmethods such as Representational Similarity Analysis (RSA) and Centered Kernel\nAlignment (CKA). What do we miss when we forgo causal explorations, and how can\nwe target specific types of similarity? In this work, we introduce Model\nAlignment Search (MAS), a method for causally exploring distributed\nrepresentational similarity. The method learns invertible linear\ntransformations that align a subspace between two distributed networks'\nrepresentations where causal information can be freely interchanged. We first\nshow that the method can be used to transfer specific causal variables, such as\nthe number of items in a counting task, between networks with different\ntraining seeds. We then explore open questions in number cognition by comparing\ndifferent types of numeric representations in models trained on structurally\ndifferent numeric tasks. We then explore differences between MAS and preexisting\ncausal similarity methods, showing MAS to be more resistant to unwanted\nexchanges. 
Lastly, we introduce a counterfactual latent auxiliary loss function\nthat helps shape causally relevant alignments even in cases where we do not\nhave causal access to one of the two models for training.\n","authors":["Satchel Grant"],"pdf_url":"https://arxiv.org/pdf/2501.06164v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02780v2","updated":"2025-01-10T18:14:56Z","published":"2024-09-17T19:07:13Z","title":"Guess What I Think: Streamlined EEG-to-Image Generation with Latent\n Diffusion Models","summary":" Generating images from brain waves is gaining increasing attention due to its\npotential to advance brain-computer interface (BCI) systems by understanding\nhow brain signals encode visual cues. Most of the literature has focused on\nfMRI-to-Image tasks as fMRI is characterized by high spatial resolution.\nHowever, fMRI is an expensive neuroimaging modality and does not allow for\nreal-time BCI. On the other hand, electroencephalography (EEG) is a low-cost,\nnon-invasive, and portable neuroimaging technique, making it an attractive\noption for future real-time applications. Nevertheless, EEG presents inherent\nchallenges due to its low spatial resolution and susceptibility to noise and\nartifacts, which makes generating images from EEG more difficult. In this\npaper, we address these problems with a streamlined framework based on the\nControlNet adapter for conditioning a latent diffusion model (LDM) through EEG\nsignals. We conduct experiments and ablation studies on popular benchmarks to\ndemonstrate that the proposed method beats other state-of-the-art models.\nUnlike these methods, which often require extensive preprocessing, pretraining,\ndifferent losses, and captioning models, our approach is efficient and\nstraightforward, requiring only minimal preprocessing and a few components. 
The\ncode is available at https://github.com/LuigiSigillo/GWIT.\n","authors":["Eleonora Lopez","Luigi Sigillo","Federica Colonnese","Massimo Panella","Danilo Comminiello"],"pdf_url":"https://arxiv.org/pdf/2410.02780v2.pdf","comment":"Accepted at ICASSP 2025"},{"id":"http://arxiv.org/abs/2501.06146v1","updated":"2025-01-10T18:10:06Z","published":"2025-01-10T18:10:06Z","title":"xLSTM-SENet: xLSTM for Single-Channel Speech Enhancement","summary":" While attention-based architectures, such as Conformers, excel in speech\nenhancement, they face challenges such as scalability with respect to input\nsequence length. In contrast, the recently proposed Extended Long Short-Term\nMemory (xLSTM) architecture offers linear scalability. However, xLSTM-based\nmodels remain unexplored for speech enhancement. This paper introduces\nxLSTM-SENet, the first xLSTM-based single-channel speech enhancement system. A\ncomparative analysis reveals that xLSTM, and notably even LSTM, can match or\noutperform state-of-the-art Mamba- and Conformer-based systems across various\nmodel sizes in speech enhancement on the VoiceBank+DEMAND dataset. Through\nablation studies, we identify key architectural design choices, such as\nexponential gating and bidirectionality, that contribute to its effectiveness. 
Our\nbest xLSTM-based model, xLSTM-SENet2, outperforms state-of-the-art Mamba- and\nConformer-based systems on the VoiceBank+DEMAND dataset.\n","authors":["Nikolai Lund Kühne","Jan Østergaard","Jesper Jensen","Zheng-Hua Tan"],"pdf_url":"https://arxiv.org/pdf/2501.06146v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06143v1","updated":"2025-01-10T18:08:07Z","published":"2025-01-10T18:08:07Z","title":"Multilingual Performance of a Multimodal Artificial Intelligence System\n on Multisubject Physics Concept Inventories","summary":" We investigate the multilingual and multimodal performance of a large\nlanguage model-based artificial intelligence (AI) system, GPT-4o, on a diverse\nset of physics concept inventories spanning multiple languages and subject\nareas. The inventories, taken from the PhysPort website, cover the classical\nphysics topics of mechanics, electromagnetism, optics, and thermodynamics as\nwell as relativity, quantum mechanics, astronomy, mathematics, and laboratory\nskills. Unlike previous text-only studies, we uploaded the inventories as\nimages mirroring what a student would see on paper, assessing the system's\nmultimodal functionality. The AI is prompted in English and autonomously\nchooses the language of its response - either remaining in the nominal language\nof the test, switching entirely to English, or mixing languages - revealing\nadaptive behavior dependent on linguistic complexity and data availability. Our\nresults indicate some variation in performance across subject areas, with\nlaboratory skills standing out as the area of poorest performance. Furthermore,\nthe AI's performance on questions that require visual interpretation of images\nis worse than on purely text-based questions. Questions that are difficult for\nthe AI tend to be that way irrespective of the inventory language. 
We also find\nlarge variations in performance across languages, with some appearing to\nbenefit substantially from language switching, a phenomenon similar to\ncode-switching of human speakers. Overall, comparing the obtained AI results to\nthe existing literature, we find that the AI system outperforms average\nundergraduate students post-instruction in all subject areas but laboratory\nskills.\n","authors":["Gerd Kortemeyer","Marina Babayeva","Giulia Polverini","Bor Gregorcic","Ralf Widenhorn"],"pdf_url":"https://arxiv.org/pdf/2501.06143v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06141v1","updated":"2025-01-10T18:03:46Z","published":"2025-01-10T18:03:46Z","title":"Emergent Symbol-like Number Variables in Artificial Neural Networks","summary":" What types of numeric representations emerge in Neural Networks (NNs)? To\nwhat degree do NNs induce abstract, mutable, slot-like numeric variables, and\nin what situations do these representations emerge? How do these\nrepresentations change over learning, and how can we understand the neural\nimplementations in ways that are unified across different NNs? In this work, we\napproach these questions by first training sequence-based neural systems using\nNext Token Prediction (NTP) objectives on numeric tasks. We then seek to\nunderstand the neural solutions through the lens of causal abstractions or\nsymbolic algorithms. We use a combination of causal interventions and\nvisualization methods to find that artificial neural models do indeed develop\nanalogs of interchangeable, mutable, latent number variables purely from the\nNTP objective. 
We then ask how variations on the tasks and model architectures\naffect the models' learned solutions to find that these symbol-like numeric\nrepresentations do not form for every variant of the task, and transformers\nsolve the problem in a notably different way than their recurrent counterparts.\nWe then show how the symbol-like variables change over the course of training\nto find a strong correlation between the models' task performance and the\nalignment of their symbol-like representations. Lastly, we show that in all\ncases, some degree of gradience exists in these neural symbols, highlighting\nthe difficulty of finding simple, interpretable symbolic stories of how neural\nnetworks perform numeric tasks. Taken together, our results are consistent with\nthe view that neural networks can approximate interpretable symbolic programs\nof number cognition, but the particular program they approximate and the extent\nto which they approximate it can vary widely, depending on the network\narchitecture, training data, extent of training, and network size.\n","authors":["Satchel Grant","Noah D. Goodman","James L. McClelland"],"pdf_url":"https://arxiv.org/pdf/2501.06141v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11456v2","updated":"2025-01-10T17:54:39Z","published":"2024-09-17T17:48:12Z","title":"Two Stage Segmentation of Cervical Tumors using PocketNet","summary":" Cervical cancer remains the fourth most common malignancy amongst women\nworldwide.1 Concurrent chemoradiotherapy (CRT) serves as the mainstay\ndefinitive treatment regimen for locally advanced cervical cancers and includes\nexternal beam radiation followed by brachytherapy.2 Integral to radiotherapy\ntreatment planning is the routine contouring of both the target tumor at the\nlevel of the cervix, associated gynecologic anatomy and the adjacent organs at\nrisk (OARs). 
However, manual contouring of these structures is both time- and\nlabor-intensive and associated with known interobserver variability that can\nimpact treatment outcomes. While multiple tools have been developed to\nautomatically segment OARs and the high-risk clinical tumor volume (HR-CTV)\nusing computed tomography (CT) images,3,4,5,6 the development of deep\nlearning-based tumor segmentation tools using routine T2-weighted (T2w)\nmagnetic resonance imaging (MRI) addresses an unmet clinical need to improve\nthe routine contouring of both anatomical structures and cervical cancers,\nthereby increasing quality and consistency of radiotherapy planning. This work\napplied a novel deep-learning model (PocketNet) to segment the cervix, vagina,\nuterus, and tumor(s) on T2w MRI. The performance of the PocketNet architecture\nwas evaluated when trained via 5-fold cross-validation. PocketNet\nachieved a mean Dice-Sorensen similarity coefficient (DSC) exceeding 70% for\ntumor segmentation and 80% for organ segmentation. These results suggest that\nPocketNet is robust to variations in contrast protocols, providing reliable\nsegmentation of the regions of interest.\n","authors":["Awj Twam","Megan Jacobsen","Rachel Glenn","Peng Wei","Jia Sun","Ann Klopp","Aradhana M. Venkatesan","David Fuentes"],"pdf_url":"https://arxiv.org/pdf/2409.11456v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06137v1","updated":"2025-01-10T17:52:34Z","published":"2025-01-10T17:52:34Z","title":"Supervision policies can shape long-term risk management in\n general-purpose AI models","summary":" The rapid proliferation and deployment of General-Purpose AI (GPAI) models,\nincluding large language models (LLMs), present unprecedented challenges for AI\nsupervisory entities. We hypothesize that these entities will need to navigate\nan emergent ecosystem of risk and incident reporting, likely to exceed their\nsupervision capacity. 
To investigate this, we develop a simulation framework\nparameterized by features extracted from the diverse landscape of risk,\nincident, or hazard reporting ecosystems, including community-driven platforms,\ncrowdsourcing initiatives, and expert assessments. We evaluate four supervision\npolicies: non-prioritized (first-come, first-served), random selection,\npriority-based (addressing the highest-priority risks first), and\ndiversity-prioritized (balancing high-priority risks with comprehensive\ncoverage across risk types). Our results indicate that while priority-based and\ndiversity-prioritized policies are more effective at mitigating high-impact\nrisks, particularly those identified by experts, they may inadvertently neglect\nsystemic issues reported by the broader community. This oversight can create\nfeedback loops that amplify certain types of reporting while discouraging\nothers, leading to a skewed perception of the overall risk landscape. We\nvalidate our simulation results with several real-world datasets, including one\nwith over a million ChatGPT interactions, of which more than 150,000\nconversations were identified as risky. 
This validation underscores the complex\ntrade-offs inherent in AI risk supervision and highlights how the choice of\nrisk management policies can shape the future landscape of AI risks across\ndiverse GPAI models used in society.\n","authors":["Manuel Cebrian","Emilia Gomez","David Fernandez Llorca"],"pdf_url":"https://arxiv.org/pdf/2501.06137v1.pdf","comment":"24 pages, 14 figures"},{"id":"http://arxiv.org/abs/2501.06132v1","updated":"2025-01-10T17:44:57Z","published":"2025-01-10T17:44:57Z","title":"CoDriveVLM: VLM-Enhanced Urban Cooperative Dispatching and Motion\n Planning for Future Autonomous Mobility on Demand Systems","summary":" The increasing demand for flexible and efficient urban transportation\nsolutions has spotlighted the limitations of traditional Demand Responsive\nTransport (DRT) systems, particularly in accommodating diverse passenger needs\nand dynamic urban environments. Autonomous Mobility-on-Demand (AMoD) systems\nhave emerged as a promising alternative, leveraging connected and autonomous\nvehicles (CAVs) to provide responsive and adaptable services. However, existing\nmethods primarily focus on either vehicle scheduling or path planning, which\noften simplify complex urban layouts and neglect the necessity for simultaneous\ncoordination and mutual avoidance among CAVs. This oversimplification poses\nsignificant challenges to the deployment of AMoD systems in real-world\nscenarios. To address these gaps, we propose CoDriveVLM, a novel framework that\nintegrates high-fidelity simultaneous dispatching and cooperative motion\nplanning for future AMoD systems. Our method harnesses Vision-Language Models\n(VLMs) to enhance multi-modality information processing, and this enables\ncomprehensive dispatching and collision risk evaluation. The VLM-enhanced CAV\ndispatching coordinator is introduced to effectively manage complex and\nunforeseen AMoD conditions, thus supporting efficient scheduling\ndecision-making. 
Furthermore, we propose a scalable decentralized cooperative\nmotion planning method via consensus alternating direction method of\nmultipliers (ADMM) focusing on collision risk evaluation and decentralized\ntrajectory optimization. Simulation results demonstrate the feasibility and\nrobustness of CoDriveVLM in various traffic conditions, showcasing its\npotential to significantly improve the fidelity and effectiveness of AMoD\nsystems in future urban transportation networks. The code is available at\nhttps://github.com/henryhcliu/CoDriveVLM.git.\n","authors":["Haichao Liu","Ruoyu Yao","Wenru Liu","Zhenmin Huang","Shaojie Shen","Jun Ma"],"pdf_url":"https://arxiv.org/pdf/2501.06132v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.02189v2","updated":"2025-01-10T17:43:10Z","published":"2025-01-04T04:59:33Z","title":"Benchmark Evaluations, Applications, and Challenges of Large Vision\n Language Models: A Survey","summary":" Multimodal Vision Language Models (VLMs) have emerged as a transformative\ntechnology at the intersection of computer vision and natural language\nprocessing, enabling machines to perceive and reason about the world through\nboth visual and textual modalities. For example, models such as CLIP, Claude,\nand GPT-4V demonstrate strong reasoning and understanding abilities on visual\nand textual data and beat classical single modality vision models on zero-shot\nclassification. Despite their rapid advancements in research and growing\npopularity in applications, a comprehensive survey of existing studies on VLMs\nis notably lacking, particularly for researchers aiming to leverage VLMs in\ntheir specific domains. 
To this end, we provide a systematic overview of VLMs\nin the following aspects: model information of the major VLMs developed over\nthe past five years (2019-2024); the main architectures and training methods of\nthese VLMs; summary and categorization of the popular benchmarks and evaluation\nmetrics of VLMs; the applications of VLMs including embodied agents, robotics,\nand video generation; the challenges and issues faced by current VLMs such as\nhallucination, fairness, and safety. Detailed collections including papers and\nmodel repository links are listed in\nhttps://github.com/zli12321/Awesome-VLM-Papers-And-Models.git.\n","authors":["Zongxia Li","Xiyang Wu","Hongyang Du","Huy Nghiem","Guangyao Shi"],"pdf_url":"https://arxiv.org/pdf/2501.02189v2.pdf","comment":"35 pages, 3 figures"},{"id":"http://arxiv.org/abs/2501.06129v1","updated":"2025-01-10T17:35:06Z","published":"2025-01-10T17:35:06Z","title":"Contextual ASR Error Handling with LLMs Augmentation for Goal-Oriented\n Conversational AI","summary":" General-purpose automatic speech recognition (ASR) systems do not always\nperform well in goal-oriented dialogue. Existing ASR correction methods rely on\nprior user data or named entities. We extend correction to tasks that have no\nprior user data and exhibit linguistic flexibility such as lexical and\nsyntactic variations. We propose a novel context augmentation with a large\nlanguage model and a ranking strategy that incorporates contextual information\nfrom the dialogue states of a goal-oriented conversational AI and its tasks.\nOur method ranks (1) n-best ASR hypotheses by their lexical and semantic\nsimilarity with context and (2) context by phonetic correspondence with ASR\nhypotheses. Evaluated in home improvement and cooking domains with real-world\nusers, our method improves recall and F1 of correction by 34% and 16%,\nrespectively, while maintaining precision and false positive rate. 
Users rated\n0.8-1 point (out of 5) higher when our correction method worked properly, with\nno decrease due to false positives.\n","authors":["Yuya Asano","Sabit Hassan","Paras Sharma","Anthony Sicilia","Katherine Atwell","Diane Litman","Malihe Alikhani"],"pdf_url":"https://arxiv.org/pdf/2501.06129v1.pdf","comment":"Accepted to COLING 2025 Industry Track"},{"id":"http://arxiv.org/abs/2501.06117v1","updated":"2025-01-10T17:15:38Z","published":"2025-01-10T17:15:38Z","title":"Fleurs-SLU: A Massively Multilingual Benchmark for Spoken Language\n Understanding","summary":" While recent multilingual automatic speech recognition models claim to\nsupport thousands of languages, ASR for low-resource languages remains highly\nunreliable due to limited bimodal speech and text training data. Better\nmultilingual spoken language understanding (SLU) can massively strengthen the\nrobustness of multilingual ASR by leveraging language semantics to compensate\nfor scarce training data, such as disambiguating utterances via context or\nexploiting semantic similarities across languages. Even more so, SLU is\nindispensable for inclusive speech technology in roughly half of all living\nlanguages that lack a formal writing system. However, the evaluation of\nmultilingual SLU remains limited to shallower tasks such as intent\nclassification or language identification. To address this, we present\nFleurs-SLU, a multilingual SLU benchmark that encompasses topical speech\nclassification in 102 languages and multiple-choice question answering through\nlistening comprehension in 92 languages. We extensively evaluate both\nend-to-end speech classification models and cascaded systems that combine\nspeech-to-text transcription with subsequent classification by large language\nmodels on Fleurs-SLU. 
Our results show that cascaded systems exhibit greater\nrobustness in multilingual SLU tasks, though speech encoders can achieve\ncompetitive performance in topical speech classification when appropriately\npre-trained. We further find a strong correlation between robust multilingual\nASR, effective speech-to-text translation, and strong multilingual SLU,\nhighlighting the mutual benefits between acoustic and semantic speech\nrepresentations.\n","authors":["Fabian David Schmidt","Ivan Vulić","Goran Glavaš","David Ifeoluwa Adelani"],"pdf_url":"https://arxiv.org/pdf/2501.06117v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05409v2","updated":"2025-01-10T16:58:29Z","published":"2025-01-09T18:06:45Z","title":"Atlas: A Novel Pathology Foundation Model by Mayo Clinic, Charité, and\n Aignostics","summary":" Recent advances in digital pathology have demonstrated the effectiveness of\nfoundation models across diverse applications. In this report, we present\nAtlas, a novel vision foundation model based on the RudolfV approach. Our model\nwas trained on a dataset comprising 1.2 million histopathology whole slide\nimages, collected from two medical institutions: Mayo Clinic and Charit\\'e -\nUniversit\\\"atsmedizin Berlin. 
Comprehensive evaluations show that Atlas achieves\nstate-of-the-art performance across twenty-one public benchmark datasets, even\nthough it is neither the largest model by parameter count nor by training\ndataset size.\n","authors":["Maximilian Alber","Stephan Tietz","Jonas Dippel","Timo Milbich","Timothée Lesort","Panos Korfiatis","Moritz Krügener","Beatriz Perez Cancer","Neelay Shah","Alexander Möllers","Philipp Seegerer","Alexandra Carpen-Amarie","Kai Standvoss","Gabriel Dernbach","Edwin de Jong","Simon Schallenberg","Andreas Kunft","Helmut Hoffer von Ankershoffen","Gavin Schaeferle","Patrick Duffy","Matt Redlon","Philipp Jurmeister","David Horst","Lukas Ruff","Klaus-Robert Müller","Frederick Klauschen","Andrew Norgan"],"pdf_url":"https://arxiv.org/pdf/2501.05409v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06099v1","updated":"2025-01-10T16:53:48Z","published":"2025-01-10T16:53:48Z","title":"Explaining Deep Learning-based Anomaly Detection in Energy Consumption\n Data by Focusing on Contextually Relevant Data","summary":" Detecting anomalies in energy consumption data is crucial for identifying\nenergy waste, equipment malfunction, and overall, for ensuring efficient energy\nmanagement. Machine learning, and specifically deep learning approaches, have\nbeen greatly successful in anomaly detection; however, they are black-box\napproaches that do not provide transparency or explanations. SHAP and its\nvariants have been proposed to explain these models, but they suffer from high\ncomputational complexity (SHAP) or instability and inconsistency (e.g., Kernel\nSHAP). To address these challenges, this paper proposes an explainability\napproach for anomalies in energy consumption data that focuses on\ncontext-relevant information. 
The proposed approach leverages existing\nexplainability techniques, focusing on SHAP variants, together with global\nfeature importance and weighted cosine similarity to select the background\ndataset based on the context of each anomaly point. By focusing on the context\nand most relevant features, this approach mitigates the instability of\nexplainability algorithms. Experimental results across 10 different machine\nlearning models, five datasets, and five XAI techniques demonstrate that our\nmethod reduces the variability of explanations, providing consistent\nexplanations. Statistical analyses confirm the robustness of our approach,\nshowing an average reduction in variability of approximately 38% across\nmultiple datasets.\n","authors":["Mohammad Noorchenarboo","Katarina Grolinger"],"pdf_url":"https://arxiv.org/pdf/2501.06099v1.pdf","comment":"26 pages, 8 figures"},{"id":"http://arxiv.org/abs/2501.06089v1","updated":"2025-01-10T16:39:01Z","published":"2025-01-10T16:39:01Z","title":"Towards Developing Socially Compliant Automated Vehicles: State of the\n Art, Experts Expectations, and A Conceptual Framework","summary":" Automated Vehicles (AVs) hold promise for revolutionizing transportation by\nimproving road safety, traffic efficiency, and overall mobility. Despite the\nsteady advancement in high-level AVs in recent years, the transition to full\nautomation entails a period of mixed traffic, where AVs of varying automation\nlevels coexist with human-driven vehicles (HDVs). Making AVs socially compliant\nand understood by human drivers is expected to improve the safety and\nefficiency of mixed traffic. Thus, ensuring AVs' compatibility with HDVs and\nsocial acceptance is crucial for their successful and seamless integration into\nmixed traffic. However, research in this critical area of developing Socially\nCompliant AVs (SCAVs) remains sparse. 
This study carries out the first\ncomprehensive scoping review to assess the current state of the art in\ndeveloping SCAVs, identifying key concepts, methodological approaches, and\nresearch gaps. An expert interview was also conducted to identify critical\nresearch gaps and expectations towards SCAVs. Based on the scoping review and\nexpert interview input, a conceptual framework is proposed for the development\nof SCAVs. The conceptual framework is evaluated using an online survey\ntargeting researchers, technicians, policymakers, and other relevant\nprofessionals worldwide. The survey results provide valuable validation and\ninsights, affirming the significance of the proposed conceptual framework in\ntackling the challenges of integrating AVs into mixed-traffic environments.\nAdditionally, future research perspectives and suggestions are discussed,\ncontributing to the research and development agenda of SCAVs.\n","authors":["Yongqi Dong","Bart van Arem","Haneen Farah"],"pdf_url":"https://arxiv.org/pdf/2501.06089v1.pdf","comment":"39 pages, 13 figures, under review by the journal of Transportation\n Research Part E: Logistics and Transportation Review"},{"id":"http://arxiv.org/abs/2501.06086v1","updated":"2025-01-10T16:34:19Z","published":"2025-01-10T16:34:19Z","title":"All AI Models are Wrong, but Some are Optimal","summary":" AI models that predict the future behavior of a system (a.k.a. predictive AI\nmodels) are central to intelligent decision-making. However, decision-making\nusing predictive AI models often results in suboptimal performance. This is\nprimarily because AI models are typically constructed to best fit the data, and\nhence to predict the most likely future rather than to enable high-performance\ndecision-making. The hope that such prediction enables high-performance\ndecisions is neither guaranteed in theory nor established in practice. 
In fact,\nthere is increasing empirical evidence that predictive models must be tailored\nto decision-making objectives for performance. In this paper, we establish\nformal (necessary and sufficient) conditions that a predictive model (AI-based\nor not) must satisfy for a decision-making policy established using that model\nto be optimal. We then discuss their implications for building predictive AI\nmodels for sequential decision-making.\n","authors":["Akhil S Anand","Shambhuraj Sawant","Dirk Reinhardt","Sebastien Gros"],"pdf_url":"https://arxiv.org/pdf/2501.06086v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.06433v3","updated":"2025-01-10T16:26:43Z","published":"2022-10-12T17:30:12Z","title":"Self-supervised video pretraining yields robust and more human-aligned\n visual representations","summary":" Humans learn powerful representations of objects and scenes by observing how\nthey evolve over time. Yet, outside of specific tasks that require explicit\ntemporal understanding, static image pretraining remains the dominant paradigm\nfor learning visual foundation models. We question this mismatch, and ask\nwhether video pretraining can yield visual representations that bear the\nhallmarks of human perception: generalisation across tasks, robustness to\nperturbations, and consistency with human judgements. To that end we propose a\nnovel procedure for curating videos, and develop a contrastive framework which\nlearns from the complex transformations therein. This simple paradigm for\ndistilling knowledge from videos, called VITO, yields general representations\nthat far outperform prior video pretraining methods on image understanding\ntasks, and image pretraining methods on video understanding tasks. Moreover,\nVITO representations are significantly more robust to natural and synthetic\ndeformations than image-, video-, and adversarially-trained ones. 
Finally,\nVITO's predictions are strongly aligned with human judgements, surpassing\nmodels that were specifically trained for that purpose. Together, these results\nsuggest that video pretraining could be a simple way of learning unified,\nrobust, and human-aligned representations of the visual world.\n","authors":["Nikhil Parthasarathy","S. M. Ali Eslami","João Carreira","Olivier J. Hénaff"],"pdf_url":"https://arxiv.org/pdf/2210.06433v3.pdf","comment":"Accepted to 37th Conference on Neural Information Processing Systems\n (NeurIPS 2023)"},{"id":"http://arxiv.org/abs/2501.06080v1","updated":"2025-01-10T16:15:23Z","published":"2025-01-10T16:15:23Z","title":"Scale-up Unlearnable Examples Learning with High-Performance Computing","summary":" Recent advancements in AI models are structured to retain user interactions,\nwhich could inadvertently include sensitive healthcare data. In the healthcare\nfield, particularly when radiologists use AI-driven diagnostic tools hosted on\nonline platforms, there is a risk that medical imaging data may be repurposed\nfor future AI training without explicit consent, spotlighting critical privacy\nand intellectual property concerns around healthcare data usage. Addressing\nthese privacy challenges, a novel approach known as Unlearnable Examples (UEs)\nhas been introduced, aiming to make data unlearnable to deep learning models. A\nprominent method within this area, called Unlearnable Clustering (UC), has\nshown improved UE performance with larger batch sizes but was previously\nlimited by computational resources. To push the boundaries of UE performance\nwith theoretically unlimited resources, we scaled up UC learning across various\ndatasets using Distributed Data Parallel (DDP) training on the Summit\nsupercomputer. Our goal was to examine UE efficacy at high-performance\ncomputing (HPC) levels to prevent unauthorized learning and enhance data\nsecurity, particularly exploring the impact of batch size on UE's\nunlearnability. 
Utilizing the robust computational capabilities of the Summit,\nextensive experiments were conducted on diverse datasets such as Pets,\nMedMNIST, Flowers, and Flowers102. Our findings reveal that both overly large\nand overly small batch sizes can lead to performance instability and affect\naccuracy. However, the relationship between batch size and unlearnability\nvaried across datasets, highlighting the necessity for tailored batch size\nstrategies to achieve optimal data protection. Our results underscore the\ncritical role of selecting appropriate batch sizes based on the specific\ncharacteristics of each dataset to prevent learning and ensure data security in\ndeep learning applications.\n","authors":["Yanfan Zhu","Issac Lyngaas","Murali Gopalakrishnan Meena","Mary Ellen I. Koran","Bradley Malin","Daniel Moyer","Shunxing Bao","Anuj Kapadia","Xiao Wang","Bennett Landman","Yuankai Huo"],"pdf_url":"https://arxiv.org/pdf/2501.06080v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06078v1","updated":"2025-01-10T16:14:35Z","published":"2025-01-10T16:14:35Z","title":"Explaining k-Nearest Neighbors: Abductive and Counterfactual\n Explanations","summary":" Despite the wide use of $k$-Nearest Neighbors as classification models, their\nexplainability properties remain poorly understood from a theoretical\nperspective. While nearest neighbors classifiers offer interpretability from a\n\"data perspective\", in which the classification of an input vector $\\bar{x}$ is\nexplained by identifying the vectors $\\bar{v}_1, \\ldots, \\bar{v}_k$ in the\ntraining set that determine the classification of $\\bar{x}$, we argue that such\nexplanations can be impractical in high-dimensional applications, where each\nvector has hundreds or thousands of features and it is not clear what their\nrelative importance is. 
Hence, we focus on understanding nearest neighbor\nclassifications through a \"feature perspective\", in which the goal is to\nidentify how the values of the features in $\\bar{x}$ affect its classification.\nConcretely, we study abductive explanations such as \"minimum sufficient\nreasons\", which correspond to sets of features in $\\bar{x}$ that are enough to\nguarantee its classification, and \"counterfactual explanations\" based on the\nminimum distance feature changes one would have to perform in $\\bar{x}$ to\nchange its classification. We present a detailed landscape of positive and\nnegative complexity results for counterfactual and abductive explanations,\ndistinguishing between discrete and continuous feature spaces, and considering\nthe impact of the choice of distance function involved. Finally, we show that\ndespite some negative complexity results, Integer Quadratic Programming and SAT\nsolving allow for computing explanations in practice.\n","authors":["Pablo Barceló","Alexander Kozachinskiy","Miguel Romero Orth","Bernardo Subercaseaux","José Verschae"],"pdf_url":"https://arxiv.org/pdf/2501.06078v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06066v1","updated":"2025-01-10T15:57:23Z","published":"2025-01-10T15:57:23Z","title":"Distilling Calibration via Conformalized Credal Inference","summary":" Deploying artificial intelligence (AI) models on edge devices involves a\ndelicate balance between meeting stringent complexity constraints, such as\nlimited memory and energy resources, and ensuring reliable performance in\nsensitive decision-making tasks. One way to enhance reliability is through\nuncertainty quantification via Bayesian inference. This approach, however,\ntypically necessitates maintaining and running multiple models in an ensemble,\nwhich may exceed the computational limits of edge devices. 
This paper\nintroduces a low-complexity methodology to address this challenge by distilling\ncalibration information from a more complex model. In an offline phase,\npredictive probabilities generated by a high-complexity cloud-based model are\nleveraged to determine a threshold based on the typical divergence between the\ncloud and edge models. At run time, this threshold is used to construct credal\nsets -- ranges of predictive probabilities that are guaranteed, with a\nuser-selected confidence level, to include the predictions of the cloud model.\nThe credal sets are obtained through thresholding of a divergence measure in\nthe simplex of predictive probabilities. Experiments on visual and language\ntasks demonstrate that the proposed approach, termed Conformalized Distillation\nfor Credal Inference (CD-CI), significantly improves calibration performance\ncompared to low-complexity Bayesian methods, such as Laplace approximation,\nmaking it a practical and efficient solution for edge AI deployments.\n","authors":["Jiayi Huang","Sangwoo Park","Nicola Paoletti","Osvaldo Simeone"],"pdf_url":"https://arxiv.org/pdf/2501.06066v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2407.04103v2","updated":"2025-01-10T15:37:26Z","published":"2024-07-04T18:06:48Z","title":"Advances in Diffusion Models for Image Data Augmentation: A Review of\n Methods, Models, Evaluation Metrics and Future Research Directions","summary":" Image data augmentation constitutes a critical methodology in modern computer\nvision tasks, since it can help enhance the diversity and quality of training\ndatasets, thereby improving the performance and robustness of machine learning\nmodels in downstream tasks. In parallel, augmentation\napproaches can also be used for editing/modifying a given image in a context-\nand semantics-aware way. 
Diffusion Models (DMs), which comprise one of the most\nrecent and highly promising classes of methods in the field of generative\nArtificial Intelligence (AI), have emerged as a powerful tool for image data\naugmentation, capable of generating realistic and diverse images by learning\nthe underlying data distribution. The current study realizes a systematic,\ncomprehensive and in-depth review of DM-based approaches for image\naugmentation, covering a wide range of strategies, tasks and applications. In\nparticular, a comprehensive analysis of the fundamental principles, model\narchitectures and training strategies of DMs is initially performed.\nSubsequently, a taxonomy of the relevant image augmentation methods is\nintroduced, focusing on techniques regarding semantic manipulation,\npersonalization and adaptation, and application-specific augmentation tasks.\nThen, performance assessment methodologies and respective evaluation metrics\nare analyzed. Finally, current challenges and future research directions in the\nfield are discussed.\n","authors":["Panagiotis Alimisis","Ioannis Mademlis","Panagiotis Radoglou-Grammatikis","Panagiotis Sarigiannidis","Georgios Th. Papadopoulos"],"pdf_url":"https://arxiv.org/pdf/2407.04103v2.pdf","comment":"65 pages, 15 figures"},{"id":"http://arxiv.org/abs/2410.18710v2","updated":"2025-01-10T15:37:01Z","published":"2024-10-23T07:55:40Z","title":"Uncovering the Genetic Basis of Glioblastoma Heterogeneity through\n Multimodal Analysis of Whole Slide Images and RNA Sequencing Data","summary":" Glioblastoma is a highly aggressive form of brain cancer characterized by\nrapid progression and poor prognosis. Despite advances in treatment, the\nunderlying genetic mechanisms driving this aggressiveness remain poorly\nunderstood. In this study, we employed multimodal deep learning approaches to\ninvestigate glioblastoma heterogeneity using joint image/RNA-seq analysis. Our\nresults reveal novel genes associated with glioblastoma. 
By leveraging a\ncombination of whole-slide images and RNA-seq, as well as introducing novel\nmethods to encode RNA-seq data, we identified specific genetic profiles that\nmay explain different patterns of glioblastoma progression. These findings\nprovide new insights into the genetic mechanisms underlying glioblastoma\nheterogeneity and highlight potential targets for therapeutic intervention.\n","authors":["Ahmad Berjaoui","Louis Roussel","Eduardo Hugo Sanchez","Elizabeth Cohen-Jonathan Moyal"],"pdf_url":"https://arxiv.org/pdf/2410.18710v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06051v1","updated":"2025-01-10T15:30:46Z","published":"2025-01-10T15:30:46Z","title":"Benchmarking Rotary Position Embeddings for Automatic Speech Recognition","summary":" Rotary Position Embedding (RoPE) encodes relative and absolute positional\ninformation in Transformer-based models through rotation matrices applied to\ninput vectors within sequences. While RoPE has demonstrated superior\nperformance compared to other positional embedding technologies in natural\nlanguage processing tasks, its effectiveness in speech processing applications\nremains understudied. In this work, we conduct a comprehensive evaluation of\nRoPE across diverse automatic speech recognition (ASR) tasks. Our experimental\nresults demonstrate that for ASR tasks, RoPE consistently achieves lower error\nrates compared to the currently widely used relative positional embedding. 
To\nfacilitate further research, we release the implementation and all experimental\nrecipes through the SpeechBrain toolkit.\n","authors":["Shucong Zhang","Titouan Parcollet","Rogier van Dalen","Sourav Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2501.06051v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.05289v3","updated":"2025-01-10T15:25:06Z","published":"2024-10-02T14:14:17Z","title":"MARS: A neurosymbolic approach for interpretable drug discovery","summary":" Neurosymbolic (NeSy) artificial intelligence describes the combination of\nlogic or rule-based techniques with neural networks. Compared to neural\napproaches, NeSy methods often possess enhanced interpretability, which is\nparticularly promising for biomedical applications like drug discovery.\nHowever, since interpretability is broadly defined, there are no clear\nguidelines for assessing the biological plausibility of model interpretations.\nTo assess interpretability in the context of drug discovery, we devise a novel\nprediction task, called drug mechanism-of-action (MoA) deconvolution, with an\nassociated, tailored knowledge graph (KG), MoA-net. We then develop the MoA\nRetrieval System (MARS), a NeSy approach for drug discovery which leverages\nlogical rules with learned rule weights. Using this interpretable feature\nalongside domain knowledge, we find that MARS and other NeSy approaches on KGs\nare susceptible to reasoning shortcuts, in which the prediction of true labels\nis driven by \"degree-bias\" rather than the domain-based rules. Subsequently, we\ndemonstrate ways to identify and mitigate this. Thereafter, MARS achieves\nperformance on par with current state-of-the-art models while producing model\ninterpretations aligned with known MoAs.\n","authors":["Lauren Nicole DeLong","Yojana Gadiya","Paola Galdi","Jacques D. Fleuriot","Daniel Domingo-Fernández"],"pdf_url":"https://arxiv.org/pdf/2410.05289v3.pdf","comment":"Under review. 10 pages, 7 supplementary pages. 
Corresponding code is\n here: https://github.com/laurendelong21/MARS and here:\n https://github.com/laurendelong21/MoA-Net"},{"id":"http://arxiv.org/abs/2501.06039v1","updated":"2025-01-10T15:17:27Z","published":"2025-01-10T15:17:27Z","title":"AI-powered virtual tissues from spatial proteomics for clinical\n diagnostics and biomedical discovery","summary":" Spatial proteomics technologies have transformed our understanding of complex\ntissue architectures by enabling simultaneous analysis of multiple molecular\nmarkers and their spatial organization. The high dimensionality of these data,\nvarying marker combinations across experiments and heterogeneous study designs\npose unique challenges for computational analysis. Here, we present Virtual\nTissues (VirTues), a foundation model framework for biological tissues that\noperates across the molecular, cellular and tissue scale. VirTues introduces\ninnovations in transformer architecture design, including a novel tokenization\nscheme that captures both spatial and marker dimensions, and attention\nmechanisms that scale to high-dimensional multiplex data while maintaining\ninterpretability. Trained on diverse cancer and non-cancer tissue datasets,\nVirTues demonstrates strong generalization capabilities without task-specific\nfine-tuning, enabling cross-study analysis and novel marker integration. 
As a\ngeneralist model, VirTues outperforms existing approaches across clinical\ndiagnostics, biological discovery and patient case retrieval tasks, while\nproviding insights into tissue function and disease mechanisms.\n","authors":["Johann Wenckstern","Eeshaan Jain","Kiril Vasilev","Matteo Pariset","Andreas Wicki","Gabriele Gut","Charlotte Bunne"],"pdf_url":"https://arxiv.org/pdf/2501.06039v1.pdf","comment":"23 pages, 5 figures"},{"id":"http://arxiv.org/abs/2411.19876v3","updated":"2025-01-10T15:08:44Z","published":"2024-11-29T17:38:56Z","title":"LUMIA: Linear probing for Unimodal and MultiModal Membership Inference\n Attacks leveraging internal LLM states","summary":" Large Language Models (LLMs) are increasingly used in a variety of\napplications, but concerns around membership inference have grown in parallel.\nPrevious efforts focus on black-to-grey-box models, thus neglecting the\npotential benefit from internal LLM information. To address this, we propose\nthe use of Linear Probes (LPs) as a method to detect Membership Inference\nAttacks (MIAs) by examining internal activations of LLMs. Our approach, dubbed\nLUMIA, applies LPs layer-by-layer to get fine-grained data on the model inner\nworkings. We test this method across several model architectures, sizes and\ndatasets, including unimodal and multimodal tasks. In unimodal MIA, LUMIA\nachieves an average gain of 15.71 % in Area Under the Curve (AUC) over previous\ntechniques. Remarkably, LUMIA reaches AUC>60% in 65.33% of cases -- an\nincrement of 46.80% against the state of the art. 
Furthermore, our approach\nreveals key insights, such as the model layers where MIAs are most detectable.\nIn multimodal models, LPs indicate that visual inputs can significantly\ncontribute to detecting MIAs -- AUC>60% is reached in 85.90% of experiments.\n","authors":["Luis Ibanez-Lissen","Lorena Gonzalez-Manzano","Jose Maria de Fuentes","Nicolas Anciaux","Joaquin Garcia-Alfaro"],"pdf_url":"https://arxiv.org/pdf/2411.19876v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06025v1","updated":"2025-01-10T15:01:51Z","published":"2025-01-10T15:01:51Z","title":"How to Tune a Multilingual Encoder Model for Germanic Languages: A Study\n of PEFT, Full Fine-Tuning, and Language Adapters","summary":" This paper investigates the optimal use of the multilingual encoder model\nmDeBERTa for tasks in three Germanic languages -- German, Swedish, and\nIcelandic -- representing varying levels of presence and likely data quality in\nmDeBERTa's pre-training data. We compare full fine-tuning with the\nparameter-efficient fine-tuning (PEFT) methods LoRA and Pfeiffer bottleneck\nadapters, finding that PEFT is more effective for the higher-resource language,\nGerman. However, results for Swedish and Icelandic are less consistent. 
We also\nobserve differences between tasks: While PEFT tends to work better for question\nanswering, full fine-tuning is preferable for named entity recognition.\nInspired by previous research on modular approaches that combine task and\nlanguage adapters, we evaluate the impact of adding PEFT modules trained on\nunstructured text, finding that this approach is not beneficial.\n","authors":["Romina Oji","Jenny Kunz"],"pdf_url":"https://arxiv.org/pdf/2501.06025v1.pdf","comment":"Accepted at NoDaLiDa Baltic-HLT 2025 Conference"},{"id":"http://arxiv.org/abs/2501.06019v1","updated":"2025-01-10T14:57:18Z","published":"2025-01-10T14:57:18Z","title":"BRIGHT: A globally distributed multimodal building damage assessment\n dataset with very-high-resolution for all-weather disaster response","summary":" Disaster events occur around the world and cause significant damage to human\nlife and property. Earth observation (EO) data enables rapid and comprehensive\nbuilding damage assessment (BDA), an essential capability in the aftermath of a\ndisaster to reduce human casualties and to inform disaster relief efforts.\nRecent research focuses on the development of AI models to achieve accurate\nmapping of unseen disaster events, mostly using optical EO data. However,\nsolutions based on optical data are limited to clear skies and daylight hours,\npreventing a prompt response to disasters. Integrating multimodal (MM) EO data,\nparticularly the combination of optical and SAR imagery, makes it possible to\nprovide all-weather, day-and-night disaster responses. Despite this potential,\nthe development of robust multimodal AI models has been constrained by the lack\nof suitable benchmark datasets. In this paper, we present a BDA dataset using\nveRy-hIGH-resoluTion optical and SAR imagery (BRIGHT) to support AI-based\nall-weather disaster response. 
To the best of our knowledge, BRIGHT is the\nfirst open-access, globally distributed, event-diverse MM dataset specifically\ncurated to support AI-based disaster response. It covers five types of natural\ndisasters and two types of man-made disasters across 12 regions worldwide, with\na particular focus on developing countries where external assistance is most\nneeded. The optical and SAR imagery in BRIGHT, with a spatial resolution\nbetween 0.3-1 meters, provides detailed representations of individual\nbuildings, making it ideal for precise BDA. In our experiments, we have tested\nseven advanced AI models trained with our BRIGHT to validate the\ntransferability and robustness. The dataset and code are available at\nhttps://github.com/ChenHongruixuan/BRIGHT. BRIGHT also serves as the official\ndataset for the 2025 IEEE GRSS Data Fusion Contest.\n","authors":["Hongruixuan Chen","Jian Song","Olivier Dietrich","Clifford Broni-Bediako","Weihao Xuan","Junjue Wang","Xinlei Shao","Yimin Wei","Junshi Xia","Cuiling Lan","Konrad Schindler","Naoto Yokoya"],"pdf_url":"https://arxiv.org/pdf/2501.06019v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04211v2","updated":"2025-01-10T14:36:48Z","published":"2025-01-08T01:11:17Z","title":"CURing Large Models: Compression via CUR Decomposition","summary":" Large deep learning models have achieved remarkable success but are\nresource-intensive, posing challenges such as memory usage. We introduce\nCURing, a novel model compression method based on CUR matrix decomposition,\nwhich approximates weight matrices as the product of selected columns (C) and\nrows (R), and a small linking matrix (U). We apply this decomposition to\nweights chosen based on the combined influence of their magnitudes and\nactivations. By identifying and retaining informative rows and columns, CURing\nsignificantly reduces model size with minimal performance loss. 
For example, it\nreduces Llama3.1-8B's parameters to 7.32B (-9%) in just 129 seconds, over 20\ntimes faster than prior compression methods.\n","authors":["Sanghyeon Park","Soo-Mook Moon"],"pdf_url":"https://arxiv.org/pdf/2501.04211v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04127v3","updated":"2025-01-10T14:31:21Z","published":"2024-06-06T14:49:06Z","title":"Are We Done with MMLU?","summary":" Maybe not. We identify and analyse errors in the popular Massive Multitask\nLanguage Understanding (MMLU) benchmark. Even though MMLU is widely adopted,\nour analysis demonstrates numerous ground truth errors that obscure the true\ncapabilities of LLMs. For example, we find that 57% of the analysed questions\nin the Virology subset contain errors. To address this issue, we introduce a\ncomprehensive framework for identifying dataset errors using a novel error\nannotation protocol. Then, we create MMLU-Redux, which is a subset of 5,700\nmanually re-annotated questions across all 57 MMLU subjects. We estimate that\n6.49% of MMLU questions contain errors. Using MMLU-Redux, we demonstrate\nsignificant discrepancies with the model performance metrics that were\noriginally reported. Our results strongly advocate for revising MMLU's\nerror-ridden questions to enhance its future utility and reliability as a\nbenchmark. 
https://huggingface.co/datasets/edinburgh-dawg/mmlu-redux-2.0.\n","authors":["Aryo Pradipta Gema","Joshua Ong Jun Leang","Giwon Hong","Alessio Devoto","Alberto Carlo Maria Mancino","Rohit Saxena","Xuanli He","Yu Zhao","Xiaotang Du","Mohammad Reza Ghasemi Madani","Claire Barale","Robert McHardy","Joshua Harris","Jean Kaddour","Emile van Krieken","Pasquale Minervini"],"pdf_url":"https://arxiv.org/pdf/2406.04127v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05989v1","updated":"2025-01-10T14:20:46Z","published":"2025-01-10T14:20:46Z","title":"Addressing speaker gender bias in large scale speech translation systems","summary":" This study addresses the issue of speaker gender bias in Speech Translation\n(ST) systems, which can lead to offensive and inaccurate translations. The\nmasculine bias often found in large-scale ST systems is typically perpetuated\nthrough training data derived from Machine Translation (MT) systems. Our\napproach involves two key steps. First, we employ Large Language Models (LLMs)\nto rectify translations based on the speaker's gender in a cost-effective\nmanner. Second, we fine-tune the ST model with the corrected data, enabling the\nmodel to generate gender-specific translations directly from audio cues,\nwithout the need for explicit gender input. Additionally, we propose a\nthree-mode fine-tuned model for scenarios where the speaker's gender is either\npredefined or should not be inferred from speech cues. 
We demonstrate a 70%\nimprovement in translations for female speakers compared to our baseline and\nother large-scale ST systems, such as Seamless M4T and Canary, on the MuST-SHE\ntest set.\n","authors":["Shubham Bansal","Vikas Joshi","Harveen Chadha","Rupeshkumar Mehta","Jinyu Li"],"pdf_url":"https://arxiv.org/pdf/2501.05989v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14488v2","updated":"2025-01-10T13:52:14Z","published":"2024-12-19T03:22:47Z","title":"A stochastic first-order method with multi-extrapolated momentum for\n highly smooth unconstrained optimization","summary":" In this paper, we consider an unconstrained stochastic optimization problem\nwhere the objective function exhibits high-order smoothness. Specifically, we\npropose a new stochastic first-order method (SFOM) with multi-extrapolated\nmomentum, in which multiple extrapolations are performed in each iteration,\nfollowed by a momentum update based on these extrapolations. We demonstrate\nthat the proposed SFOM can accelerate optimization by exploiting the high-order\nsmoothness of the objective function $f$. Assuming that the $p$th-order\nderivative of $f$ is Lipschitz continuous for some $p\\ge2$, and under\nadditional mild assumptions, we establish that our method achieves a sample\ncomplexity of $\\widetilde{\\mathcal{O}}(\\epsilon^{-(3p+1)/p})$ for finding a\npoint $x$ such that $\\mathbb{E}[\\|\\nabla f(x)\\|]\\le\\epsilon$. To the best of\nour knowledge, this is the first SFOM to leverage arbitrary-order smoothness of\nthe objective function for acceleration, resulting in a sample complexity that\nimproves upon the best-known results without assuming the mean-squared\nsmoothness condition. 
Preliminary numerical experiments validate the practical\nperformance of our method and support our theoretical findings.\n","authors":["Chuan He"],"pdf_url":"https://arxiv.org/pdf/2412.14488v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05962v1","updated":"2025-01-10T13:42:40Z","published":"2025-01-10T13:42:40Z","title":"Effective faking of verbal deception detection with target-aligned\n adversarial attacks","summary":" Background: Deception detection through analysing language is a promising\navenue using both human judgments and automated machine learning judgments. For\nboth forms of credibility assessment, automated adversarial attacks that\nrewrite deceptive statements to appear truthful pose a serious threat. Methods:\nWe used a dataset of 243 truthful and 262 fabricated autobiographical stories\nin a deception detection task for humans and machine learning models. A large\nlanguage model was tasked to rewrite deceptive statements so that they appear\ntruthful. In Study 1, humans who made a deception judgment or used the\ndetailedness heuristic and two machine learning models (a fine-tuned language\nmodel and a simple n-gram model) judged original or adversarial modifications\nof deceptive statements. In Study 2, we manipulated the target alignment of the\nmodifications, i.e. tailoring the attack to whether the statements would be\nassessed by humans or computer models. Results: When adversarial modifications\nwere aligned with their target, human (d=-0.07 and d=-0.04) and machine\njudgments (51% accuracy) dropped to the chance level. When the attack was not\naligned with the target, both human heuristics judgments (d=0.30 and d=0.36)\nand machine learning predictions (63-78%) were significantly better than\nchance. Conclusions: Easily accessible language models can effectively help\nanyone fake deception detection efforts both by humans and machine learning\nmodels. 
Robustness against adversarial modifications for humans and machines\ndepends on that target alignment. We close with suggestions on advancing\ndeception research with adversarial attack designs.\n","authors":["Bennett Kleinberg","Riccardo Loconte","Bruno Verschuere"],"pdf_url":"https://arxiv.org/pdf/2501.05962v1.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2412.11698v2","updated":"2025-01-10T13:35:37Z","published":"2024-12-16T12:21:05Z","title":"On Large Language Models in Mission-Critical IT Governance: Are We Ready\n Yet?","summary":" Context. The security of critical infrastructure has been a pressing concern\nsince the advent of computers and has become even more critical in today's era\nof cyber warfare. Protecting mission-critical systems (MCSs), essential for\nnational security, requires swift and robust governance, yet recent events\nreveal the increasing difficulty of meeting these challenges. Aim. Building on\nprior research showcasing the potential of Generative AI (GAI), such as Large\nLanguage Models, in enhancing risk analysis, we aim to explore practitioners'\nviews on integrating GAI into the governance of IT MCSs. Our goal is to provide\nactionable insights and recommendations for stakeholders, including\nresearchers, practitioners, and policymakers. Method. We designed a survey to\ncollect practical experiences, concerns, and expectations of practitioners who\ndevelop and implement security solutions in the context of MCSs. Conclusions\nand Future Works. Our findings highlight that the safe use of LLMs in MCS\ngovernance requires interdisciplinary collaboration. 
Researchers should focus\non designing regulation-oriented models and focus on accountability;\npractitioners emphasize data protection and transparency, while policymakers\nmust establish a unified AI framework with global benchmarks to ensure ethical\nand secure LLMs-based MCS governance.\n","authors":["Matteo Esposito","Francesco Palagiano","Valentina Lenarduzzi","Davide Taibi"],"pdf_url":"https://arxiv.org/pdf/2412.11698v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03916v2","updated":"2025-01-10T13:14:28Z","published":"2025-01-07T16:31:10Z","title":"Dolphin: Closed-loop Open-ended Auto-research through Thinking,\n Practice, and Feedback","summary":" The scientific research paradigm is undergoing a profound transformation\nowing to the development of Artificial Intelligence (AI). Recent works\ndemonstrate that various AI-assisted research methods can largely improve\nresearch efficiency by improving data analysis, accelerating computation, and\nfostering novel idea generation. To further move towards the ultimate goal\n(i.e., automatic scientific research), in this paper, we propose Dolphin, the\nfirst closed-loop open-ended auto-research framework to further build the\nentire process of human scientific research. Dolphin can generate research\nideas, perform experiments, and get feedback from experimental results to\ngenerate higher-quality ideas. More specifically, Dolphin first generates novel\nideas based on relevant papers which are ranked by the topic and task\nattributes. Then, the codes are automatically generated and debugged with the\nexception-traceback-guided local code structure. Finally, Dolphin automatically\nanalyzes the results of each idea and feeds the results back to the next round\nof idea generation. Experiments are conducted on the benchmark datasets of\ndifferent topics and results show that Dolphin can generate novel ideas\ncontinuously and complete the experiment in a loop. 
We highlight that Dolphin\ncan automatically propose methods that are comparable to the state-of-the-art\nin some tasks such as 2D image classification and 3D point classification.\n","authors":["Jiakang Yuan","Xiangchao Yan","Botian Shi","Tao Chen","Wanli Ouyang","Bo Zhang","Lei Bai","Yu Qiao","Bowen Zhou"],"pdf_url":"https://arxiv.org/pdf/2501.03916v2.pdf","comment":"19 pages, 11 figures, and our homepage:\n https://alpha-innovator.github.io/Dolphin-project-page"},{"id":"http://arxiv.org/abs/2311.03056v4","updated":"2025-01-10T13:01:45Z","published":"2023-11-06T12:22:19Z","title":"LitSumm: Large language models for literature summarisation of\n non-coding RNAs","summary":" Curation of literature in life sciences is a growing challenge. The continued\nincrease in the rate of publication, coupled with the relatively fixed number\nof curators worldwide presents a major challenge to developers of biomedical\nknowledgebases. Very few knowledgebases have resources to scale to the whole\nrelevant literature and all have to prioritise their efforts.\n In this work, we take a first step to alleviating the lack of curator time in\nRNA science by generating summaries of literature for non-coding RNAs using\nlarge language models (LLMs). We demonstrate that high-quality, factually\naccurate summaries with accurate references can be automatically generated from\nthe literature using a commercial LLM and a chain of prompts and checks. Manual\nassessment was carried out for a subset of summaries, with the majority being\nrated extremely high quality.\n We apply our tool to a selection of over 4,600 ncRNAs and make the generated\nsummaries available via the RNAcentral resource. We conclude that automated\nliterature summarization is feasible with the current generation of LLMs,\nprovided careful prompting and automated checking are applied.\n","authors":["Andrew Green","Carlos Ribas","Nancy Ontiveros-Palacios","Sam Griffiths-Jones","Anton I. 
Petrov","Alex Bateman","Blake Sweeney"],"pdf_url":"https://arxiv.org/pdf/2311.03056v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05932v1","updated":"2025-01-10T12:55:34Z","published":"2025-01-10T12:55:34Z","title":"DiffuSETS: 12-lead ECG Generation Conditioned on Clinical Text Reports\n and Patient-Specific Information","summary":" Heart disease remains a significant threat to human health. As a non-invasive\ndiagnostic tool, the electrocardiogram (ECG) is one of the most widely used\nmethods for cardiac screening. However, the scarcity of high-quality ECG data,\ndriven by privacy concerns and limited medical resources, creates a pressing\nneed for effective ECG signal generation. Existing approaches for generating\nECG signals typically rely on small training datasets, lack comprehensive\nevaluation frameworks, and overlook potential applications beyond data\naugmentation. To address these challenges, we propose DiffuSETS, a novel\nframework capable of generating ECG signals with high semantic alignment and\nfidelity. DiffuSETS accepts various modalities of clinical text reports and\npatient-specific information as inputs, enabling the creation of clinically\nmeaningful ECG signals. Additionally, to address the lack of standardized\nevaluation in ECG generation, we introduce a comprehensive benchmarking\nmethodology to assess the effectiveness of generative models in this domain.\nOur model achieves excellent results in tests, proving its superiority in the\ntask of ECG generation. 
Furthermore, we showcase its potential to mitigate data\nscarcity while exploring novel applications in cardiology education and medical\nknowledge discovery, highlighting the broader impact of our work.\n","authors":["Yongfan Lai","Jiabo Chen","Deyun Zhang","Yue Wang","Shijia Geng","Hongyan Li","Shenda Hong"],"pdf_url":"https://arxiv.org/pdf/2501.05932v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05928v1","updated":"2025-01-10T12:49:12Z","published":"2025-01-10T12:49:12Z","title":"Towards Backdoor Stealthiness in Model Parameter Space","summary":" Recent research on backdoor stealthiness focuses mainly on indistinguishable\ntriggers in input space and inseparable backdoor representations in feature\nspace, aiming to circumvent backdoor defenses that examine these respective\nspaces. However, existing backdoor attacks are typically designed to resist a\nspecific type of backdoor defense without considering the diverse range of\ndefense mechanisms. Based on this observation, we pose a natural question: Are\ncurrent backdoor attacks truly a real-world threat when facing diverse\npractical defenses?\n To answer this question, we examine 12 common backdoor attacks that focus on\ninput-space or feature-space stealthiness and 17 diverse representative\ndefenses. Surprisingly, we reveal a critical blind spot: Backdoor attacks\ndesigned to be stealthy in input and feature spaces can be mitigated by\nexamining backdoored models in parameter space. To investigate the underlying\ncauses behind this common vulnerability, we study the characteristics of\nbackdoor attacks in the parameter space. Notably, we find that input- and\nfeature-space attacks introduce prominent backdoor-related neurons in parameter\nspace, which are not thoroughly considered by current backdoor attacks. Taking\ncomprehensive stealthiness into account, we propose a novel supply-chain attack\ncalled Grond. 
Grond limits the parameter changes by a simple yet effective\nmodule, Adversarial Backdoor Injection (ABI), which adaptively increases the\nparameter-space stealthiness during the backdoor injection. Extensive\nexperiments demonstrate that Grond outperforms all 12 backdoor attacks against\nstate-of-the-art (including adaptive) defenses on CIFAR-10, GTSRB, and a subset\nof ImageNet. In addition, we show that ABI consistently improves the\neffectiveness of common backdoor attacks.\n","authors":["Xiaoyun Xu","Zhuoran Liu","Stefanos Koffas","Stjepan Picek"],"pdf_url":"https://arxiv.org/pdf/2501.05928v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05921v1","updated":"2025-01-10T12:26:38Z","published":"2025-01-10T12:26:38Z","title":"The New Anticipatory Governance Culture for Innovation: Regulatory\n Foresight, Regulatory Experimentation and Regulatory Learning","summary":" With the rapid pace of technological innovation, traditional methods of\npolicy formation and legislating are becoming conspicuously anachronistic. The\nneed for regulatory choices to be made to counter the deadening effect of\nregulatory lag is more important to developing markets and fostering growth\nthan achieving one off regulatory perfection. This article advances scholarship\non innovation policy and the regulation of technological innovation in the\nEuropean Union. It does so by considering what building an agile yet robust\nanticipatory governance regulatory culture involves. It systematically\nexcavates a variety of tools and elements that are being put into use in\ninventive ways and argues that these need to be more cohesively and\nsystemically integrated into the regulatory toolbox. 
Approaches covered include\nstrategic foresight, the critical embrace of iterative policy development and\nregulatory learning in the face of uncertainty, and the embrace of bottom-up\napproaches to co-creation of policy such as Policy Labs and the testing and\nregulatory learning through pilot regulation and experimentation. The growing\nuse of regulatory sandboxes as an EU policy tool to boost innovation and\nnavigate regulatory complexity, as seen in the EU AI Act, is also probed.\n","authors":["Deirdre Ahern"],"pdf_url":"https://arxiv.org/pdf/2501.05921v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05891v1","updated":"2025-01-10T11:44:35Z","published":"2025-01-10T11:44:35Z","title":"Affordably Fine-tuned LLMs Provide Better Answers to Course-specific\n MCQs","summary":" In education, the capability of generating human-like text of Large Language\nModels (LLMs) inspired work on how they can increase the efficiency of learning\nand teaching. We study the affordability of these models for educators and\nstudents by investigating how LLMs answer multiple-choice questions (MCQs) with\nrespect to hardware constraints and refinement techniques. We explore this\nspace by using generic pre-trained LLMs (the 7B, 13B, and 70B variants of\nLLaMA-2) to answer 162 undergraduate-level MCQs from a course on Programming\nLanguages (PL) -- the MCQ dataset is a contribution of this work, which we make\npublicly available. Specifically, we dissect how different factors, such as\nusing readily-available material -- (parts of) the course's textbook -- for\nfine-tuning and quantisation (to decrease resource usage) can change the\naccuracy of the responses. 
The main takeaway is that smaller textbook-based\nfine-tuned models outperform generic larger ones (whose pre-training requires\nconspicuous resources), making the usage of LLMs for answering MCQs resource-\nand material-wise affordable.\n","authors":["Bianca Raimondi","Saverio Giallorenzo","Maurizio Gabbrielli"],"pdf_url":"https://arxiv.org/pdf/2501.05891v1.pdf","comment":"The 40th ACM/SIGAPP Symposium On Applied Computing"},{"id":"http://arxiv.org/abs/2501.05885v1","updated":"2025-01-10T11:37:50Z","published":"2025-01-10T11:37:50Z","title":"EDNet: Edge-Optimized Small Target Detection in UAV Imagery -- Faster\n Context Attention, Better Feature Fusion, and Hardware Acceleration","summary":" Detecting small targets in drone imagery is challenging due to low\nresolution, complex backgrounds, and dynamic scenes. We propose EDNet, a novel\nedge-target detection framework built on an enhanced YOLOv10 architecture,\noptimized for real-time applications without post-processing. EDNet\nincorporates an XSmall detection head and a Cross Concat strategy to improve\nfeature fusion and multi-scale context awareness for detecting tiny targets in\ndiverse environments. Our unique C2f-FCA block employs Faster Context Attention\nto enhance feature extraction while reducing computational complexity. The WIoU\nloss function is employed for improved bounding box regression. With seven\nmodel sizes ranging from Tiny to XL, EDNet accommodates various deployment\nenvironments, enabling local real-time inference and ensuring data privacy.\nNotably, EDNet achieves up to a 5.6% gain in mAP@50 with significantly fewer\nparameters. On an iPhone 12, EDNet variants operate at speeds ranging from 16\nto 55 FPS, providing a scalable and efficient solution for edge-based object\ndetection in challenging drone imagery. The source code and pre-trained models\nare available at: https://github.com/zsniko/EDNet.\n","authors":["Zhifan Song","Yuan Zhang","Abd Al Rahman M. 
Abu Ebayyeh"],"pdf_url":"https://arxiv.org/pdf/2501.05885v1.pdf","comment":"Accepted in 21st IEEE International Conference on Ubiquitous\n Intelligence and Computing (UIC 2024)\n https://www.ieee-smart-world.org/2024/uic"},{"id":"http://arxiv.org/abs/2501.01987v2","updated":"2025-01-10T11:36:09Z","published":"2024-12-30T18:08:13Z","title":"Gender Bias in Text-to-Video Generation Models: A case study of Sora","summary":" The advent of text-to-video generation models has revolutionized content\ncreation as it produces high-quality videos from textual prompts. However,\nconcerns regarding inherent biases in such models have prompted scrutiny,\nparticularly regarding gender representation. Our study investigates the\npresence of gender bias in OpenAI's Sora, a state-of-the-art text-to-video\ngeneration model. We uncover significant evidence of bias by analyzing the\ngenerated videos from a diverse set of gender-neutral and stereotypical\nprompts. The results indicate that Sora disproportionately associates specific\ngenders with stereotypical behaviors and professions, which reflects societal\nprejudices embedded in its training data.\n","authors":["Mohammad Nadeem","Shahab Saquib Sohail","Erik Cambria","Björn W. Schuller","Amir Hussain"],"pdf_url":"https://arxiv.org/pdf/2501.01987v2.pdf","comment":"7 pages, 3 figures"},{"id":"http://arxiv.org/abs/2501.05882v1","updated":"2025-01-10T11:34:22Z","published":"2025-01-10T11:34:22Z","title":"Solving nonograms using Neural Networks","summary":" Nonograms are logic puzzles in which cells in a grid must be colored or left\nblank according to the numbers that are located in its headers. In this study,\nwe analyze different techniques to solve this type of logical problem using an\nHeuristic Algorithm, Genetic Algorithm, and Heuristic Algorithm with Neural\nNetwork. Furthermore, we generate a public dataset to train the neural\nnetworks. We published this dataset and the code of the algorithms. 
The combination\nof the heuristic algorithm with a neural network obtained the best results.\nFrom a state-of-the-art review, no previous works used neural networks to solve\nnonograms, nor combined a network with other algorithms to accelerate the\nresolution process.\n","authors":["José María Buades Rubio","Antoni Jaume-i-Capó","David López González","Gabriel Moyà Alcover"],"pdf_url":"https://arxiv.org/pdf/2501.05882v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05874v1","updated":"2025-01-10T11:17:15Z","published":"2025-01-10T11:17:15Z","title":"VideoRAG: Retrieval-Augmented Generation over Video Corpus","summary":" Retrieval-Augmented Generation (RAG) is a powerful strategy to address the\nissue of generating factually incorrect outputs in foundation models by\nretrieving external knowledge relevant to queries and incorporating it into\ntheir generation process. However, existing RAG approaches have primarily\nfocused on textual information, with some recent advancements beginning to\nconsider images, and they largely overlook videos, a rich source of multimodal\nknowledge capable of representing events, processes, and contextual details\nmore effectively than any other modality. While a few recent studies explore\nthe integration of videos in the response generation process, they either\npredefine query-associated videos without retrieving them according to queries,\nor convert videos into textual descriptions without harnessing their\nmultimodal richness. To tackle these, we introduce VideoRAG, a novel framework\nthat not only dynamically retrieves relevant videos based on their relevance\nwith queries but also utilizes both visual and textual information of videos in\nthe output generation. 
Further, to operationalize this, our method revolves\naround the recent advance of Large Video Language Models (LVLMs), which enable\nthe direct processing of video content to represent it for retrieval and\nseamless integration of the retrieved videos jointly with queries. We\nexperimentally validate the effectiveness of VideoRAG, showcasing that it is\nsuperior to relevant baselines.\n","authors":["Soyeong Jeong","Kangsan Kim","Jinheon Baek","Sung Ju Hwang"],"pdf_url":"https://arxiv.org/pdf/2501.05874v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03968v2","updated":"2025-01-10T10:38:49Z","published":"2025-01-07T18:06:27Z","title":"VLM-driven Behavior Tree for Context-aware Task Planning","summary":" The use of Large Language Models (LLMs) for generating Behavior Trees (BTs)\nhas recently gained attention in the robotics community, yet remains in its\nearly stages of development. In this paper, we propose a novel framework that\nleverages Vision-Language Models (VLMs) to interactively generate and edit BTs\nthat address visual conditions, enabling context-aware robot operations in\nvisually complex environments. A key feature of our approach lies in the\nconditional control through self-prompted visual conditions. Specifically, the\nVLM generates BTs with visual condition nodes, where conditions are expressed\nas free-form text. Another VLM process integrates the text into its prompt and\nevaluates the conditions against real-world images during robot execution. We\nvalidated our framework in a real-world cafe scenario, demonstrating both its\nfeasibility and limitations.\n","authors":["Naoki Wake","Atsushi Kanehira","Jun Takamatsu","Kazuhiro Sasabuchi","Katsushi Ikeuchi"],"pdf_url":"https://arxiv.org/pdf/2501.03968v2.pdf","comment":"10 pages, 11 figures, 5 tables. 
Last updated on January 9th, 2024"},{"id":"http://arxiv.org/abs/2406.10221v2","updated":"2025-01-10T10:36:58Z","published":"2024-06-14T17:54:54Z","title":"Long Story Short: Story-level Video Understanding from 20K Short Films","summary":" Recent developments in vision-language models have significantly advanced\nvideo understanding. Existing datasets and tasks, however, have notable\nlimitations. Most datasets are confined to short videos with limited events and\nnarrow narratives. For example, datasets with instructional and egocentric\nvideos often depict activities of one person in a single scene. Although\nexisting movie datasets offer richer content, they are often limited to\nshort-term tasks, lack publicly available videos, and frequently encounter data\nleakage issues given the use of subtitles and other information about\ncommercial movies during LLM pretraining. To address the above limitations, we\npropose Short-Films 20K (SF20K), the largest publicly available movie dataset.\nSF20K is composed of 20,143 amateur films and offers long-term video tasks in\nthe form of multiple-choice and open-ended question answering. 
Our extensive\nanalysis of SF20K reveals minimal data leakage, emphasizes the need for\nlong-term reasoning, and demonstrates the strong performance of recent VLMs.\nFinally, we show that instruction tuning on the SF20K-Train set substantially\nimproves model performance, paving the way for future progress in long-term\nvideo understanding.\n","authors":["Ridouane Ghermi","Xi Wang","Vicky Kalogeiton","Ivan Laptev"],"pdf_url":"https://arxiv.org/pdf/2406.10221v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05845v1","updated":"2025-01-10T10:36:46Z","published":"2025-01-10T10:36:46Z","title":"Annealing Machine-assisted Learning of Graph Neural Network for\n Combinatorial Optimization","summary":" While Annealing Machines (AM) have shown increasing capabilities in solving\ncomplex combinatorial problems, positioning themselves as a more immediate\nalternative to the expected advances of future fully quantum solutions, there\nare still scaling limitations. In parallel, Graph Neural Networks (GNN) have\nbeen recently adapted to solve combinatorial problems, showing competitive\nresults and potentially high scalability due to their distributed nature. We\npropose a merging approach that aims at retaining both the accuracy exhibited\nby AMs and the representational flexibility and scalability of GNNs. Our model\nconsiders a compression step, followed by a supervised interaction where\npartial solutions obtained from the AM are used to guide local GNNs from where\nnode feature representations are obtained and combined to initialize an\nadditional GNN-based solver that handles the original graph's target problem.\nIntuitively, the AM can solve the combinatorial problem indirectly by infusing\nits knowledge into the GNN. 
Experiments on canonical optimization problems show\nthat the idea is feasible, effectively allowing the AM to solve size problems\nbeyond its original limits.\n","authors":["Pablo Loyola","Kento Hasegawa","Andres Hoyos-Idobro","Kazuo Ono","Toyotaro Suzumura","Yu Hirate","Masanao Yamaoka"],"pdf_url":"https://arxiv.org/pdf/2501.05845v1.pdf","comment":"Second Workshop on Machine Learning with New Compute Paradigms at\n NeurIPS 2024 (MLNCP 2024)"},{"id":"http://arxiv.org/abs/2501.01834v2","updated":"2025-01-10T10:08:50Z","published":"2025-01-03T14:38:01Z","title":"MoColl: Agent-Based Specific and General Model Collaboration for Image\n Captioning","summary":" Image captioning is a critical task at the intersection of computer vision\nand natural language processing, with wide-ranging applications across various\ndomains. For complex tasks such as diagnostic report generation, deep learning\nmodels require not only domain-specific image-caption datasets but also the\nincorporation of relevant general knowledge to provide contextual accuracy.\nExisting approaches exhibit inherent limitations: specialized models excel in\ncapturing domain-specific details but lack generalization, while\nvision-language models (VLMs) built on large language models (LLMs) leverage\ngeneral knowledge but struggle with domain-specific adaptation. To address\nthese limitations, this paper proposes a novel agent-enhanced model\ncollaboration framework, which we call MoColl, designed to effectively\nintegrate domain-specific and general knowledge. Specifically, our approach is\nto decompose complex image captioning tasks into a series of interconnected\nquestion-answer subtasks. A trainable visual question answering (VQA) model is\nemployed as a specialized tool to focus on domain-specific visual analysis,\nanswering task-specific questions based on image content. 
Concurrently, an\nLLM-based agent with general knowledge formulates these questions and\nsynthesizes the resulting question-answer pairs into coherent captions. Beyond\nits role in leveraging the VQA model, the agent further guides its training to\nenhance its domain-specific capabilities. Experimental results on radiology\nreport generation validate the effectiveness of the proposed framework,\ndemonstrating significant improvements in the quality of generated reports.\n","authors":["Pu Yang","Bin Dong"],"pdf_url":"https://arxiv.org/pdf/2501.01834v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.09278v2","updated":"2025-01-10T10:07:55Z","published":"2024-12-12T13:41:35Z","title":"Towards a Multimodal Large Language Model with Pixel-Level Insight for\n Biomedicine","summary":" In recent years, Multimodal Large Language Models (MLLM) have achieved\nnotable advancements, demonstrating the feasibility of developing an\nintelligent biomedical assistant. However, current biomedical MLLMs\npredominantly focus on image-level understanding and restrict interactions to\ntextual commands, thus limiting their capability boundaries and the flexibility\nof usage. In this paper, we introduce a novel end-to-end multimodal large\nlanguage model for the biomedical domain, named MedPLIB, which possesses\npixel-level understanding. Excitingly, it supports visual question answering\n(VQA), arbitrary pixel-level prompts (points, bounding boxes, and free-form\nshapes), and pixel-level grounding. We propose a novel Mixture-of-Experts (MoE)\nmulti-stage training strategy, which divides MoE into separate training phases\nfor a visual-language expert model and a pixel-grounding expert model, followed\nby fine-tuning using MoE. This strategy effectively coordinates multitask\nlearning while maintaining the computational cost at inference equivalent to\nthat of a single expert model. 
To advance the research of biomedical MLLMs, we\nintroduce the Medical Complex Vision Question Answering Dataset (MeCoVQA),\nwhich comprises an array of 8 modalities for complex medical imaging question\nanswering and image region understanding. Experimental results indicate that\nMedPLIB has achieved state-of-the-art outcomes across multiple medical visual\nlanguage tasks. More importantly, in zero-shot evaluations for the pixel\ngrounding task, MedPLIB leads the best small and large models by margins of\n19.7 and 15.6 respectively on the mDice metric. The codes, data, and model\ncheckpoints will be made publicly available at\nhttps://github.com/ShawnHuang497/MedPLIB.\n","authors":["Xiaoshuang Huang","Lingdong Shen","Jia Liu","Fangxin Shang","Hongxiang Li","Haifeng Huang","Yehui Yang"],"pdf_url":"https://arxiv.org/pdf/2412.09278v2.pdf","comment":"Accepted by AAAI2025"},{"id":"http://arxiv.org/abs/2501.05826v1","updated":"2025-01-10T10:03:56Z","published":"2025-01-10T10:03:56Z","title":"AI-Driven Diabetic Retinopathy Screening: Multicentric Validation of\n AIDRSS in India","summary":" Purpose: Diabetic retinopathy (DR) is a major cause of vision loss,\nparticularly in India, where access to retina specialists is limited in rural\nareas. This study aims to evaluate the Artificial Intelligence-based Diabetic\nRetinopathy Screening System (AIDRSS) for DR detection and prevalence\nassessment, addressing the growing need for scalable, automated screening\nsolutions in resource-limited settings.\n Approach: A multicentric, cross-sectional study was conducted in Kolkata,\nIndia, involving 5,029 participants and 10,058 macula-centric retinal fundus\nimages. The AIDRSS employed a deep learning algorithm with 50 million trainable\nparameters, integrated with Contrast Limited Adaptive Histogram Equalization\n(CLAHE) preprocessing for enhanced image quality. 
DR was graded using the\nInternational Clinical Diabetic Retinopathy (ICDR) Scale, categorizing disease\ninto five stages (DR0 to DR4). Statistical metrics including sensitivity,\nspecificity, and prevalence rates were evaluated against expert retina\nspecialist assessments.\n Results: The prevalence of DR in the general population was 13.7%, rising to\n38.2% among individuals with elevated random blood glucose levels. The AIDRSS\nachieved an overall sensitivity of 92%, specificity of 88%, and 100%\nsensitivity for detecting referable DR (DR3 and DR4). These results demonstrate\nthe system's robust performance in accurately identifying and grading DR in a\ndiverse population.\n Conclusions: AIDRSS provides a reliable, scalable solution for early DR\ndetection in resource-constrained environments. Its integration of advanced AI\ntechniques ensures high diagnostic accuracy, with potential to significantly\nreduce the burden of diabetes-related vision loss in underserved regions.\n","authors":["Amit Kr Dey","Pradeep Walia","Girish Somvanshi","Abrar Ali","Sagarnil Das","Pallabi Paul","Minakhi Ghosh"],"pdf_url":"https://arxiv.org/pdf/2501.05826v1.pdf","comment":"22 pages, 5 figures. arXiv admin note: substantial text overlap with\n arXiv:1812.07105 by other authors without attribution"},{"id":"http://arxiv.org/abs/2501.05819v1","updated":"2025-01-10T09:59:16Z","published":"2025-01-10T09:59:16Z","title":"Diffusion Models for Smarter UAVs: Decision-Making and Modeling","summary":" Unmanned Aerial Vehicles (UAVs) are increasingly adopted in modern\ncommunication networks. However, challenges in decision-making and digital\nmodeling continue to impede their rapid advancement. Reinforcement Learning\n(RL) algorithms face limitations such as low sample efficiency and limited data\nversatility, further magnified in UAV communication scenarios. Moreover,\nDigital Twin (DT) modeling introduces substantial decision-making and data\nmanagement complexities. 
RL models, often integrated into DT frameworks,\nrequire extensive training data to achieve accurate predictions. In contrast to\ntraditional approaches that focus on class boundaries, Diffusion Models (DMs),\na new class of generative AI, learn the underlying probability distribution\nfrom the training data and can generate trustworthy new patterns based on this\nlearned distribution. This paper explores the integration of DMs with RL and DT\nto effectively address these challenges. By combining the data generation\ncapabilities of DMs with the decision-making framework of RL and the modeling\naccuracy of DT, the integration improves the adaptability and real-time\nperformance of UAV communication. Moreover, the study shows how DMs can\nalleviate data scarcity, improve policy networks, and optimize dynamic\nmodeling, providing a robust solution for complex UAV communication scenarios.\n","authors":["Yousef Emami","Hao Zhou","Luis Almeida","Kai Li"],"pdf_url":"https://arxiv.org/pdf/2501.05819v1.pdf","comment":"7 pages, 2 figures"},{"id":"http://arxiv.org/abs/2404.05399v2","updated":"2025-01-10T09:54:54Z","published":"2024-04-08T10:57:25Z","title":"SafetyPrompts: a Systematic Review of Open Datasets for Evaluating and\n Improving Large Language Model Safety","summary":" The last two years have seen a rapid growth in concerns around the safety of\nlarge language models (LLMs). Researchers and practitioners have met these\nconcerns by creating an abundance of datasets for evaluating and improving LLM\nsafety. However, much of this work has happened in parallel, and with very\ndifferent goals in mind, ranging from the mitigation of near-term risks around\nbias and toxic content generation to the assessment of longer-term catastrophic\nrisk potential. This makes it difficult for researchers and practitioners to\nfind the most relevant datasets for their use case, and to identify gaps in\ndataset coverage that future work may fill. 
To remedy these issues, we conduct\na first systematic review of open datasets for evaluating and improving LLM\nsafety. We review 144 datasets, which we identified through an iterative and\ncommunity-driven process over the course of several months. We highlight\npatterns and trends, such as a trend towards fully synthetic datasets, as well\nas gaps in dataset coverage, such as a clear lack of non-English and\nnaturalistic datasets. We also examine how LLM safety datasets are used in\npractice -- in LLM release publications and popular LLM benchmarks -- finding\nthat current evaluation practices are highly idiosyncratic and make use of only\na small fraction of available datasets. Our contributions are based on\nSafetyPrompts.com, a living catalogue of open datasets for LLM safety, which we\nplan to update continuously as the field of LLM safety develops.\n","authors":["Paul Röttger","Fabio Pernisi","Bertie Vidgen","Dirk Hovy"],"pdf_url":"https://arxiv.org/pdf/2404.05399v2.pdf","comment":"Accepted at AAAI 2025 (Special Track on AI Alignment)"},{"id":"http://arxiv.org/abs/2308.00721v4","updated":"2025-01-10T09:35:20Z","published":"2023-07-31T03:56:46Z","title":"A Pre-trained Data Deduplication Model based on Active Learning","summary":" In the era of big data, the issue of data quality has become increasingly\nprominent. One of the main challenges is the problem of duplicate data, which\ncan arise from repeated entry or the merging of multiple data sources. These\n\"dirty data\" problems can significantly limit the effective application of big\ndata. To address the issue of data deduplication, we propose a pre-trained\ndeduplication model based on active learning, which is the first work that\nutilizes active learning to address the problem of deduplication at the\nsemantic level. 
The model is built on a pre-trained Transformer and fine-tuned\nto solve the deduplication problem as a sequence to classification task, which\nfirstly integrate the transformer with active learning into an end-to-end\narchitecture to select the most valuable data for deduplication model training,\nand also firstly employ the R-Drop method to perform data augmentation on each\nround of labeled data, which can reduce the cost of manual labeling and improve\nthe model's performance. Experimental results demonstrate that our proposed\nmodel outperforms previous state-of-the-art (SOTA) for deduplicated data\nidentification, achieving up to a 28% improvement in Recall score on benchmark\ndatasets.\n","authors":["Haochen Shi","Xinyao Liu","Fengmao Lv","Hongtao Xue","Jie Hu","Shengdong Du","Tianrui Li"],"pdf_url":"https://arxiv.org/pdf/2308.00721v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.02221v2","updated":"2025-01-10T09:26:32Z","published":"2025-01-04T07:53:38Z","title":"CORD: Generalizable Cooperation via Role Diversity","summary":" Cooperative multi-agent reinforcement learning (MARL) aims to develop agents\nthat can collaborate effectively. However, most cooperative MARL methods\noverfit training agents, making learned policies not generalize well to unseen\ncollaborators, which is a critical issue for real-world deployment. Some\nmethods attempt to address the generalization problem but require prior\nknowledge or predefined policies of new teammates, limiting real-world\napplications. To this end, we propose a hierarchical MARL approach to enable\ngeneralizable cooperation via role diversity, namely CORD. CORD's high-level\ncontroller assigns roles to low-level agents by maximizing the role entropy\nwith constraints. We show this constrained objective can be decomposed into\ncausal influence in role that enables reasonable role assignment, and role\nheterogeneity that yields coherent, non-redundant role clusters. 
Evaluated on a\nvariety of cooperative multi-agent tasks, CORD achieves better performance than\nbaselines, especially in generalization tests. Ablation studies further\ndemonstrate the efficacy of the constrained objective in generalizable\ncooperation.\n","authors":["Kanefumi Matsuyama","Kefan Su","Jiangxing Wang","Deheng Ye","Zongqing Lu"],"pdf_url":"https://arxiv.org/pdf/2501.02221v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05808v1","updated":"2025-01-10T09:15:40Z","published":"2025-01-10T09:15:40Z","title":"Real-Time Integrated Dispatching and Idle Fleet Steering with Deep\n Reinforcement Learning for A Meal Delivery Platform","summary":" To achieve high service quality and profitability, meal delivery platforms\nlike Uber Eats and Grubhub must strategically operate their fleets to ensure\ntimely deliveries for current orders while mitigating the consequential impacts\nof suboptimal decisions that leads to courier understaffing in the future. This\nstudy set out to solve the real-time order dispatching and idle courier\nsteering problems for a meal delivery platform by proposing a reinforcement\nlearning (RL)-based strategic dual-control framework. To address the inherent\nsequential nature of these problems, we model both order dispatching and\ncourier steering as Markov Decision Processes. Trained via a deep reinforcement\nlearning (DRL) framework, we obtain strategic policies by leveraging the\nexplicitly predicted demands as part of the inputs. In our dual-control\nframework, the dispatching and steering policies are iteratively trained in an\nintegrated manner. These forward-looking policies can be executed in real-time\nand provide decisions while jointly considering the impacts on local and\nnetwork levels. To enhance dispatching fairness, we propose convolutional deep\nQ networks to construct fair courier embeddings. 
To simultaneously rebalance\nthe supply and demand within the service network, we propose to utilize\nmean-field approximated supply-demand knowledge to reallocate idle couriers at\nthe local level. Utilizing the policies generated by the RL-based strategic\ndual-control framework, we find the delivery efficiency and fairness of\nworkload distribution among couriers have been improved, and under-supplied\nconditions have been alleviated within the service network. Our study sheds\nlight on designing an RL-based framework to enable forward-looking real-time\noperations for meal delivery platforms and other on-demand services.\n","authors":["Jingyi Cheng","Shadi Sharif Azadeh"],"pdf_url":"https://arxiv.org/pdf/2501.05808v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.13572v2","updated":"2025-01-10T09:11:39Z","published":"2024-02-21T07:07:54Z","title":"AlgoFormer: An Efficient Transformer Framework with Algorithmic\n Structures","summary":" Besides natural language processing, transformers exhibit extraordinary\nperformance in solving broader applications, including scientific computing and\ncomputer vision. Previous works try to explain this from the expressive power\nand capability perspectives that standard transformers are capable of\nperforming some algorithms. To empower transformers with algorithmic\ncapabilities and motivated by the recently proposed looped transformer, we\ndesign a novel transformer framework, dubbed Algorithm Transformer (abbreviated\nas AlgoFormer). We provide an insight that efficient transformer architectures\ncan be designed by leveraging prior knowledge of tasks and the underlying\nstructure of potential algorithms. Compared with the standard transformer and\nvanilla looped transformer, the proposed AlgoFormer can perform efficiently in\nalgorithm representation in some specific tasks. 
In particular, inspired by the\nstructure of human-designed learning algorithms, our transformer framework\nconsists of a pre-transformer that is responsible for task preprocessing, a\nlooped transformer for iterative optimization algorithms, and a\npost-transformer for producing the desired results after post-processing. We\nprovide theoretical evidence of the expressive power of the AlgoFormer in\nsolving some challenging problems, mirroring human-designed algorithms.\nFurthermore, some theoretical and empirical results are presented to show that\nthe designed transformer has the potential to perform algorithm representation\nand learning. Experimental results demonstrate the empirical superiority of the\nproposed transformer in that it outperforms the standard transformer and\nvanilla looped transformer in some specific tasks. An extensive experiment on\nreal language tasks (e.g., neural machine translation of German and English,\nand text classification) further validates the expressiveness and effectiveness\nof AlgoFormer.\n","authors":["Yihang Gao","Chuanyang Zheng","Enze Xie","Han Shi","Tianyang Hu","Yu Li","Michael K. Ng","Zhenguo Li","Zhaoqiang Liu"],"pdf_url":"https://arxiv.org/pdf/2402.13572v2.pdf","comment":"Published at Transactions on Machine Learning Research (TMLR). The\n paper provides insight that the Transformer architectures can mimic the\n algorithm structures in (in-context) algorithm learning and representation.\n The incorporated algorithmic structure in Algoformer shows its potential in\n (deep learning for) scientific computing, besides the real language tasks"},{"id":"http://arxiv.org/abs/2501.05803v1","updated":"2025-01-10T09:10:30Z","published":"2025-01-10T09:10:30Z","title":"Alignment without Over-optimization: Training-Free Solution for\n Diffusion Models","summary":" Diffusion models excel in generative tasks, but aligning them with specific\nobjectives while maintaining their versatility remains challenging. 
Existing\nfine-tuning methods often suffer from reward over-optimization, while\napproximate guidance approaches fail to optimize target rewards effectively.\nAddressing these limitations, we propose a training-free sampling method based\non Sequential Monte Carlo (SMC) to sample from the reward-aligned target\ndistribution. Our approach, tailored for diffusion sampling and incorporating\ntempering techniques, achieves comparable or superior target rewards to\nfine-tuning methods while preserving diversity and cross-reward generalization.\nWe demonstrate its effectiveness in single-reward optimization, multi-objective\nscenarios, and online black-box optimization. This work offers a robust\nsolution for aligning diffusion models with diverse downstream objectives\nwithout compromising their general capabilities. Code is available at\nhttps://github.com/krafton-ai/DAS .\n","authors":["Sunwoo Kim","Minkyu Kim","Dongmin Park"],"pdf_url":"https://arxiv.org/pdf/2501.05803v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05795v1","updated":"2025-01-10T08:57:50Z","published":"2025-01-10T08:57:50Z","title":"Robust Counterfactual Explanations under Model Multiplicity Using\n Multi-Objective Optimization","summary":" In recent years, explainability in machine learning has gained importance. In\nthis context, counterfactual explanation (CE), which is an explanation method\nthat uses examples, has attracted attention. However, it has been pointed out\nthat CE is not robust when there are multiple machine-learning models. These\nproblems are important when using machine learning to make safe decisions. In\nthis paper, we propose robust CEs that introduce a new viewpoint - Pareto\nimprovement - and a method that uses multi-objective optimization to generate\nit. To evaluate the proposed method, we conducted experiments using both\nsimulated and actual data. The results demonstrate that the proposed method is\nrobust and useful. 
We believe that this research will contribute to a wide\nrange of research areas, such as explainability in machine learning,\ndecision-making, and action planning based on machine learning.\n","authors":["Keita Kinjo"],"pdf_url":"https://arxiv.org/pdf/2501.05795v1.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2501.05790v1","updated":"2025-01-10T08:50:38Z","published":"2025-01-10T08:50:38Z","title":"Understanding Impact of Human Feedback via Influence Functions","summary":" In Reinforcement Learning from Human Feedback (RLHF), it is crucial to learn\nsuitable reward models from human feedback to align large language models\n(LLMs) with human intentions. However, human feedback can often be noisy,\ninconsistent, or biased, especially when evaluating complex responses. Such\nfeedback can lead to misaligned reward signals, potentially causing unintended\nside effects during the RLHF process. To address these challenges, we explore\nthe use of influence functions to measure the impact of human feedback on the\nperformance of reward models. We propose a compute-efficient approximation\nmethod that enables the application of influence functions to LLM-based reward\nmodels and large-scale preference datasets. In our experiments, we demonstrate\ntwo key applications of influence functions: (1) detecting common forms of\nlabeler bias in human feedback datasets and (2) guiding labelers to refine\ntheir strategies to align more closely with expert feedback. By quantifying the\nimpact of human feedback on reward models, we believe that influence functions\ncan enhance feedback interpretability and contribute to scalable oversight in\nRLHF, helping labelers provide more accurate and consistent feedback. 
Source\ncode is available at https://github.com/mintaywon/IF_RLHF\n","authors":["Taywon Min","Haeone Lee","Hanho Ryu","Yongchan Kwon","Kimin Lee"],"pdf_url":"https://arxiv.org/pdf/2501.05790v1.pdf","comment":"Source code: https://github.com/mintaywon/IF_RLHF"},{"id":"http://arxiv.org/abs/2501.02564v2","updated":"2025-01-10T08:40:49Z","published":"2025-01-05T14:42:47Z","title":"Balanced Multi-view Clustering","summary":" Multi-view clustering (MvC) aims to integrate information from different\nviews to enhance the capability of the model in capturing the underlying data\nstructures. The widely used joint training paradigm in MvC is potentially not\nfully leverage the multi-view information, since the imbalanced and\nunder-optimized view-specific features caused by the uniform learning objective\nfor all views. For instance, particular views with more discriminative\ninformation could dominate the learning process in the joint training paradigm,\nleading to other views being under-optimized. To alleviate this issue, we first\nanalyze the imbalanced phenomenon in the joint-training paradigm of multi-view\nclustering from the perspective of gradient descent for each view-specific\nfeature extractor. Then, we propose a novel balanced multi-view clustering\n(BMvC) method, which introduces a view-specific contrastive regularization\n(VCR) to modulate the optimization of each view. Concretely, VCR preserves the\nsample similarities captured from the joint features and view-specific ones\ninto the clustering distributions corresponding to view-specific features to\nenhance the learning process of view-specific feature extractors. Additionally,\na theoretical analysis is provided to illustrate that VCR adaptively modulates\nthe magnitudes of gradients for updating the parameters of view-specific\nfeature extractors to achieve a balanced multi-view learning procedure. 
In such\na manner, BMvC achieves a better trade-off between the exploitation of\nview-specific patterns and the exploration of view-invariance patterns to fully\nlearn the multi-view information for the clustering task. Finally, a set of\nexperiments are conducted to verify the superiority of the proposed method\ncompared with state-of-the-art approaches both on eight benchmark MvC datasets\nand two spatially resolved transcriptomics datasets.\n","authors":["Zhenglai Li","Jun Wang","Chang Tang","Xinzhong Zhu","Wei Zhang","Xinwang Liu"],"pdf_url":"https://arxiv.org/pdf/2501.02564v2.pdf","comment":"We are withdrawing this paper due to issues in the experimental\n section related to the Application for Spatially Resolved Transcriptomics\n Data Clustering. These issues affect the validity of the results presented.\n We believe it is necessary to withdraw the paper to address these problems\n adequately before resubmission."},{"id":"http://arxiv.org/abs/2501.05783v1","updated":"2025-01-10T08:33:31Z","published":"2025-01-10T08:33:31Z","title":"UV-Attack: Physical-World Adversarial Attacks for Person Detection via\n Dynamic-NeRF-based UV Mapping","summary":" In recent research, adversarial attacks on person detectors using patches or\nstatic 3D model-based texture modifications have struggled with low success\nrates due to the flexible nature of human movement. Modeling the 3D\ndeformations caused by various actions has been a major challenge. Fortunately,\nadvancements in Neural Radiance Fields (NeRF) for dynamic human modeling offer\nnew possibilities. In this paper, we introduce UV-Attack, a groundbreaking\napproach that achieves high success rates even with extensive and unseen human\nactions. We address the challenge above by leveraging dynamic-NeRF-based UV\nmapping. UV-Attack can generate human images across diverse actions and\nviewpoints, and even create novel actions by sampling from the SMPL parameter\nspace. 
While dynamic NeRF models are capable of modeling human bodies,\nmodifying clothing textures is challenging because they are embedded in neural\nnetwork parameters. To tackle this, UV-Attack generates UV maps instead of RGB\nimages and modifies the texture stacks. This approach enables real-time texture\nedits and makes the attack more practical. We also propose a novel Expectation\nover Pose Transformation loss (EoPT) to improve the evasion success rate on\nunseen poses and views. Our experiments show that UV-Attack achieves a 92.75%\nattack success rate against the FastRCNN model across varied poses in dynamic\nvideo settings, significantly outperforming the state-of-the-art AdvCamou\nattack, which only had a 28.50% ASR. Moreover, we achieve 49.5% ASR on the\nlatest YOLOv8 detector in black-box settings. This work highlights the\npotential of dynamic NeRF-based UV mapping for creating more effective\nadversarial attacks on person detectors, addressing key challenges in modeling\nhuman movement and texture modification.\n","authors":["Yanjie Li","Wenxuan Zhang","Kaisheng Liang","Bin Xiao"],"pdf_url":"https://arxiv.org/pdf/2501.05783v1.pdf","comment":"23 pages, 22 figures, submitted to ICLR2025"},{"id":"http://arxiv.org/abs/2311.02565v2","updated":"2025-01-10T08:01:09Z","published":"2023-11-05T04:43:48Z","title":"KITS: Inductive Spatio-Temporal Kriging with Increment Training Strategy","summary":" Sensors are commonly deployed to perceive the environment. However, due to\nthe high cost, sensors are usually sparsely deployed. Kriging is the tailored\ntask to infer the unobserved nodes (without sensors) using the observed source\nnodes (with sensors). The essence of kriging task is transferability. 
Recently,\nseveral inductive spatio-temporal kriging methods have been proposed based on\ngraph neural networks, being trained based on a graph built on top of observed\nnodes via pretext tasks such as masking nodes out and reconstructing them.\nHowever, the graph in training is inevitably much sparser than the graph in\ninference that includes all the observed and unobserved nodes. The learned\npattern cannot be well generalized for inference, denoted as graph gap. To\naddress this issue, we first present a novel Increment training strategy:\ninstead of masking nodes (and reconstructing them), we add virtual nodes into\nthe training graph so as to mitigate the graph gap issue naturally.\nNevertheless, the empty-shell virtual nodes without labels could have\nbad-learned features and lack supervision signals. To solve these issues, we\npair each virtual node with its most similar observed node and fuse their\nfeatures together; to enhance the supervision signal, we construct reliable\npseudo labels for virtual nodes. As a result, the learned pattern of virtual\nnodes could be safely transferred to real unobserved nodes for reliable\nkriging. We name our new Kriging model with Increment Training Strategy as\nKITS. Extensive experiments demonstrate that KITS consistently outperforms\nexisting kriging methods by large margins, e.g., the improvement over MAE score\ncould be as high as 18.33%.\n","authors":["Qianxiong Xu","Cheng Long","Ziyue Li","Sijie Ruan","Rui Zhao","Zhishuai Li"],"pdf_url":"https://arxiv.org/pdf/2311.02565v2.pdf","comment":"This paper is accepted by AAAI'25"},{"id":"http://arxiv.org/abs/2501.05768v1","updated":"2025-01-10T07:56:30Z","published":"2025-01-10T07:56:30Z","title":"Halal or Not: Knowledge Graph Completion for Predicting Cultural\n Appropriateness of Daily Products","summary":" The growing demand for halal cosmetic products has exposed significant\nchallenges, especially in Muslim-majority countries. 
Recently, various machine\nlearning-based strategies, e.g., image-based methods, have shown remarkable\nsuccess in predicting the halal status of cosmetics. However, these methods\nmainly focus on analyzing the discrete and specific ingredients within separate\ncosmetics, which ignore the high-order and complex relations between cosmetics\nand ingredients. To address this problem, we propose a halal cosmetic\nrecommendation framework, namely HaCKG, that leverages a knowledge graph of\ncosmetics and their ingredients to explicitly model and capture the\nrelationships between cosmetics and their components. By representing cosmetics\nand ingredients as entities within the knowledge graph, HaCKG effectively\nlearns the high-order and complex relations between entities, offering a robust\nmethod for predicting halal status. Specifically, we first construct a cosmetic\nknowledge graph representing the relations between various cosmetics,\ningredients, and their properties. We then propose a pre-trained relational\ngraph attention network model with residual connections to learn the structural\nrelation between entities in the knowledge graph. The pre-trained model is then\nfine-tuned on downstream cosmetic data to predict halal status. 
Extensive\nexperiments on the cosmetic dataset over halal prediction tasks demonstrate the\nsuperiority of our model over state-of-the-art baselines.\n","authors":["Van Thuy Hoang","Tien-Bach-Thanh Do","Jinho Seo","Seung Charlie Kim","Luong Vuong Nguyen","Duong Nguyen Minh Huy","Hyeon-Ju Jeon","O-Joun Lee"],"pdf_url":"https://arxiv.org/pdf/2501.05768v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2501.05767v1","updated":"2025-01-10T07:56:23Z","published":"2025-01-10T07:56:23Z","title":"Migician: Revealing the Magic of Free-Form Multi-Image Grounding in\n Multimodal Large Language Models","summary":" The recent advancement of Multimodal Large Language Models (MLLMs) has\nsignificantly improved their fine-grained perception of single images and\ngeneral comprehension across multiple images. However, existing MLLMs still\nface challenges in achieving precise grounding in complex multi-image\nscenarios. To address this, we first explore a Chain-of-Thought (CoT) framework\nthat integrates single-image grounding with multi-image comprehension. While\npartially effective, it remains unstable and struggles to capture abstract\nvisual information due to its non-end-to-end nature. Therefore, we introduce\nMigician, the first multi-image grounding model capable of performing free-form\nand accurate grounding across multiple images. To support this, we present the\nMGrounding-630k dataset, which comprises data for several multi-image grounding\ntasks derived from existing datasets, along with newly generated free-form\ngrounding instruction-following data. Furthermore, we propose MIG-Bench, a\ncomprehensive benchmark specifically designed for evaluating multi-image\ngrounding capabilities. Experimental results demonstrate that our model\nachieves significantly superior multi-image grounding capabilities,\noutperforming the best existing MLLMs by 21.61% and even surpassing much larger\n70B models. 
Our code, model, dataset, and benchmark are fully open-sourced.\n","authors":["You Li","Heyu Huang","Chi Chen","Kaiyu Huang","Chao Huang","Zonghao Guo","Zhiyuan Liu","Jinan Xu","Yuhua Li","Ruixuan Li","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2501.05767v1.pdf","comment":"20 pages, 8 figures"},{"id":"http://arxiv.org/abs/2501.05765v1","updated":"2025-01-10T07:48:40Z","published":"2025-01-10T07:48:40Z","title":"Deontic Temporal Logic for Formal Verification of AI Ethics","summary":" Ensuring ethical behavior in Artificial Intelligence (AI) systems amidst\ntheir increasing ubiquity and influence is a major concern the world over. The\nuse of formal methods in AI ethics is a possible crucial approach for\nspecifying and verifying the ethical behavior of AI systems. This paper\nproposes a formalization based on deontic logic to define and evaluate the\nethical behavior of AI systems, focusing on system-level specifications,\ncontributing to this important goal. It introduces axioms and theorems to\ncapture ethical requirements related to fairness and explainability. The\nformalization incorporates temporal operators to reason about the ethical\nbehavior of AI systems over time. The authors evaluate the effectiveness of\nthis formalization by assessing the ethics of the real-world COMPAS and loan\nprediction AI systems. Various ethical properties of the COMPAS and loan\nprediction systems are encoded using deontic logical formulas, allowing the use\nof an automated theorem prover to verify whether these systems satisfy the\ndefined properties. The formal verification reveals that both systems fail to\nfulfill certain key ethical properties related to fairness and\nnon-discrimination, demonstrating the effectiveness of the proposed\nformalization in identifying potential ethical issues in real-world AI\napplications.\n","authors":["Priya T. 
V.","Shrisha Rao"],"pdf_url":"https://arxiv.org/pdf/2501.05765v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.18144v3","updated":"2025-01-10T07:22:12Z","published":"2024-05-28T13:02:56Z","title":"4-bit Shampoo for Memory-Efficient Network Training","summary":" Second-order optimizers, maintaining a matrix termed a preconditioner, are\nsuperior to first-order optimizers in both theory and practice. The states\nforming the preconditioner and its inverse root restrict the maximum size of\nmodels trained by second-order optimizers. To address this, compressing 32-bit\noptimizer states to lower bitwidths has shown promise in reducing memory usage.\nHowever, current approaches only pertain to first-order optimizers. In this\npaper, we propose the first 4-bit second-order optimizers, exemplified by 4-bit\nShampoo, maintaining performance similar to that of 32-bit ones. We show that\nquantizing the eigenvector matrix of the preconditioner in 4-bit Shampoo is\nremarkably better than quantizing the preconditioner itself both theoretically\nand experimentally. By rectifying the orthogonality of the quantized\neigenvector matrix, we enhance the approximation of the preconditioner's\neigenvector matrix, which also benefits the computation of its inverse 4-th\nroot. 
Besides, we find that linear square quantization slightly outperforms\ndynamic tree quantization when quantizing second-order optimizer states.\nEvaluation on various networks for image classification and natural language\nmodeling demonstrates that our 4-bit Shampoo achieves comparable performance to\nits 32-bit counterpart while being more memory-efficient.\n","authors":["Sike Wang","Pan Zhou","Jia Li","Hua Huang"],"pdf_url":"https://arxiv.org/pdf/2405.18144v3.pdf","comment":"NeurIPS 2024 final camera-ready revisions, rectify the legend in\n figure 9"},{"id":"http://arxiv.org/abs/2501.05752v1","updated":"2025-01-10T07:02:43Z","published":"2025-01-10T07:02:43Z","title":"Semantic Exploration with Adaptive Gating for Efficient Problem Solving\n with Language Models","summary":" Recent advancements in large language models (LLMs) have shown remarkable\npotential in various complex tasks requiring multi-step reasoning methods like\ntree search to explore diverse reasoning paths. However, existing methods often\nsuffer from computational inefficiency and redundancy. First, they overlook the\ndiversity of task difficulties, leading to unnecessarily extensive searches\neven for easy tasks. Second, they neglect the semantics of reasoning paths,\nresulting in redundant exploration of semantically identical paths. To address\nthese limitations, we propose Semantic Exploration with Adaptive Gating (SEAG),\na computationally efficient method. SEAG employs an adaptive gating mechanism\nthat dynamically decides whether to conduct a tree search, based on the\nconfidence level of answers from a preceding simple reasoning method.\nFurthermore, its tree-based exploration consolidates semantically identical\nreasoning steps, reducing redundant explorations while maintaining or even\nimproving accuracy. 
Our extensive experiments demonstrate that SEAG\nsignificantly improves accuracy by 4.3% on average while requiring only 31% of\ncomputational costs compared to existing tree search-based methods on complex\nreasoning benchmarks including GSM8K and ARC with diverse language models such\nas Llama2, Llama3, and Mistral.\n","authors":["Sungjae Lee","Hyejin Park","Jaechang Kim","Jungseul Ok"],"pdf_url":"https://arxiv.org/pdf/2501.05752v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05730v1","updated":"2025-01-10T05:54:04Z","published":"2025-01-10T05:54:04Z","title":"Element-wise Attention Is All You Need","summary":" The self-attention (SA) mechanism has demonstrated superior performance\nacross various domains, yet it suffers from substantial complexity during both\ntraining and inference. The next-generation architecture, aiming at retaining\nthe competitive performance of SA while achieving low-cost inference and\nefficient long-sequence training, primarily focuses on three approaches: linear\nattention, linear RNNs, and state space models. Although these approaches\nachieve reduced complexity than SA, they all have built-in performance\ndegradation factors, such as diminished “spikiness” and compression of\nhistorical information. In contrast to these approaches, we propose a novel\nelement-wise attention mechanism, which uses the element-wise squared Euclidean\ndistance, instead of the dot product operation, to compute similarity and\napproximates the quadratic complexity term $\\exp(q_{ic}k_{jc})$ with a Taylor\npolynomial. 
This design achieves remarkable efficiency: during training, the\nelement-wise attention has a complexity of $\\mathcal{O}(tLD)$, making\nlong-sequence training both computationally and memory efficient, where $L$ is\nthe sequence length, $D$ is the feature dimension, and $t$ is the highest order\nof the polynomial; during inference, it can be reformulated as recurrent neural\nnetworks, achieving an inference complexity of $\\mathcal{O}(tD)$. Furthermore,\nthe element-wise attention circumvents the performance degradation factors\npresent in these approaches and achieves performance comparable to SA in both\ncausal and non-causal forms.\n","authors":["Guoxin Feng"],"pdf_url":"https://arxiv.org/pdf/2501.05730v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05729v1","updated":"2025-01-10T05:53:37Z","published":"2025-01-10T05:53:37Z","title":"ExPO: Explainable Phonetic Trait-Oriented Network for Speaker\n Verification","summary":" In speaker verification, we use a computational method to verify if an\nutterance matches the identity of an enrolled speaker. This task is similar to\nthe manual task of forensic voice comparison, where linguistic analysis is\ncombined with auditory measurements to compare and evaluate voice samples.\nDespite much success, we have yet to develop a speaker verification system that\noffers explainable results comparable to those from manual forensic voice\ncomparison. A novel approach, Explainable Phonetic Trait-Oriented (ExPO)\nnetwork, is proposed in this paper to introduce the speaker's phonetic trait\nwhich describes the speaker's characteristics at the phonetic level, resembling\nwhat forensic comparison does. 
ExPO not only generates utterance-level speaker\nembeddings but also allows for fine-grained analysis and visualization of\nphonetic traits, offering an explainable speaker verification process.\nFurthermore, we investigate phonetic traits from within-speaker and\nbetween-speaker variation perspectives to determine which trait is most\neffective for speaker verification, marking an important step towards\nexplainable speaker verification. Our code is available at\nhttps://github.com/mmmmayi/ExPO.\n","authors":["Yi Ma","Shuai Wang","Tianchi Liu","Haizhou Li"],"pdf_url":"https://arxiv.org/pdf/2501.05729v1.pdf","comment":"Accepted by IEEE Signal Processing Letters"},{"id":"http://arxiv.org/abs/2501.05727v1","updated":"2025-01-10T05:51:52Z","published":"2025-01-10T05:51:52Z","title":"Enabling Scalable Oversight via Self-Evolving Critic","summary":" Despite their remarkable performance, the development of Large Language\nModels (LLMs) faces a critical challenge in scalable oversight: providing\neffective feedback for tasks where human evaluation is difficult or where LLMs\noutperform humans. While there is growing interest in using LLMs for critique,\ncurrent approaches still rely on human annotations or more powerful models,\nleaving the issue of enhancing critique capabilities without external\nsupervision unresolved. We introduce SCRIT (Self-evolving CRITic), a framework\nthat enables genuine self-evolution of critique abilities. Technically, SCRIT\nself-improves by training on synthetic data, generated by a contrastive-based\nself-critic that uses reference solutions for step-by-step critique, and a\nself-validation mechanism that ensures critique quality through correction\noutcomes. Implemented with Qwen2.5-72B-Instruct, one of the most powerful LLMs,\nSCRIT achieves up to a 10.3\\% improvement on critique-correction and error\nidentification benchmarks. 
Our analysis reveals that SCRIT's performance scales\npositively with data and model size, outperforms alternative approaches, and\nbenefits critically from its self-validation component.\n","authors":["Zhengyang Tang","Ziniu Li","Zhenyang Xiao","Tian Ding","Ruoyu Sun","Benyou Wang","Dayiheng Liu","Fei Huang","Tianyu Liu","Bowen Yu","Junyang Lin"],"pdf_url":"https://arxiv.org/pdf/2501.05727v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00778v3","updated":"2025-01-10T05:35:58Z","published":"2024-06-02T15:35:45Z","title":"Bayesian Joint Additive Factor Models for Multiview Learning","summary":" It is increasingly common in a wide variety of applied settings to collect\ndata of multiple different types on the same set of samples. Our particular\nfocus in this article is on studying relationships between such multiview\nfeatures and responses. A motivating application arises in the context of\nprecision medicine where multi-omics data are collected to correlate with\nclinical outcomes. It is of interest to infer dependence within and across\nviews while combining multimodal information to improve the prediction of\noutcomes. The signal-to-noise ratio can vary substantially across views,\nmotivating more nuanced statistical tools beyond standard late and early\nfusion. This challenge comes with the need to preserve interpretability, select\nfeatures, and obtain accurate uncertainty quantification. We propose a joint\nadditive factor regression model (JAFAR) with a structured additive design,\naccounting for shared and view-specific components. We ensure identifiability\nvia a novel dependent cumulative shrinkage process (D-CUSP) prior. We provide\nan efficient implementation via a partially collapsed Gibbs sampler and extend\nour approach to allow flexible feature and outcome distributions. Prediction of\ntime-to-labor onset from immunome, metabolome, and proteome data illustrates\nperformance gains against state-of-the-art competitors. 
Our open-source\nsoftware (R package) is available at https://github.com/niccoloanceschi/jafar.\n","authors":["Niccolo Anceschi","Federico Ferrari","David B. Dunson","Himel Mallick"],"pdf_url":"https://arxiv.org/pdf/2406.00778v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.01350v2","updated":"2025-01-10T05:35:32Z","published":"2024-10-02T09:07:33Z","title":"Takin-VC: Expressive Zero-Shot Voice Conversion via Adaptive Hybrid\n Content Encoding and Enhanced Timbre Modeling","summary":" Expressive zero-shot voice conversion (VC) is a critical and challenging task\nthat aims to transform the source timbre into an arbitrary unseen speaker while\npreserving the original content and expressive qualities. Despite recent\nprogress in zero-shot VC, there remains considerable potential for improvements\nin speaker similarity and speech naturalness. Moreover, existing zero-shot VC\nsystems struggle to fully reproduce paralinguistic information in highly\nexpressive speech, such as breathing, crying, and emotional nuances, limiting\ntheir practical applicability. To address these issues, we propose Takin-VC, a\nnovel expressive zero-shot VC framework via adaptive hybrid content encoding\nand memory-augmented context-aware timbre modeling. Specifically, we introduce\nan innovative hybrid content encoder that incorporates an adaptive fusion\nmodule, capable of effectively integrating quantized features of the\npre-trained WavLM and HybridFormer in an implicit manner, so as to extract\nprecise linguistic features while enriching paralinguistic elements. For timbre\nmodeling, we propose advanced memory-augmented and context-aware modules to\ngenerate high-quality target timbre features and fused representations that\nseamlessly align source content with target timbre. To enhance real-time\nperformance, we advocate a conditional flow matching model to reconstruct the\nMel-spectrogram of the source speech. 
Experimental results show that our\nTakin-VC consistently surpasses state-of-the-art VC systems, achieving notable\nimprovements in terms of speech naturalness, speech expressiveness, and speaker\nsimilarity, while offering enhanced inference speed.\n","authors":["Yuguang Yang","Yu Pan","Jixun Yao","Xiang Zhang","Jianhao Ye","Hongbin Zhou","Lei Xie","Lei Ma","Jianjun Zhao"],"pdf_url":"https://arxiv.org/pdf/2410.01350v2.pdf","comment":"Work in Progress; Under Review"},{"id":"http://arxiv.org/abs/2308.15720v2","updated":"2025-01-10T05:32:06Z","published":"2023-08-30T02:50:54Z","title":"Surrogate-based Autotuning for Randomized Sketching Algorithms in\n Regression Problems","summary":" Algorithms from Randomized Numerical Linear Algebra (RandNLA) are known to be\neffective in handling high-dimensional computational problems, providing\nhigh-quality empirical performance as well as strong probabilistic guarantees.\nHowever, their practical application is complicated by the fact that the user\nneeds to set various algorithm-specific tuning parameters which are different\nthan those used in traditional NLA. This paper demonstrates how a\nsurrogate-based autotuning approach can be used to address fundamental problems\nof parameter selection in RandNLA algorithms. In particular, we provide a\ndetailed investigation of surrogate-based autotuning for\nsketch-and-precondition (SAP) based randomized least squares methods, which\nhave been one of the great success stories in modern RandNLA. Empirical results\nshow that our surrogate-based autotuning approach can achieve near-optimal\nperformance with much less tuning cost than a random search (up to about 4x\nfewer trials of different parameter configurations). Moreover, while our\nexperiments focus on least squares, our results demonstrate a general-purpose\nautotuning pipeline applicable to any kind of RandNLA algorithm.\n","authors":["Younghyun Cho","James W. Demmel","Michał Dereziński","Haoyun Li","Hengrui Luo","Michael W. 
Mahoney","Riley J. Murray"],"pdf_url":"https://arxiv.org/pdf/2308.15720v2.pdf","comment":"Improved the presentation and clarity. Updated experimental results\n and scenarios. Accepted for publication in SIAM Journal on Matrix Analysis\n and Applications"},{"id":"http://arxiv.org/abs/2501.05717v1","updated":"2025-01-10T05:29:09Z","published":"2025-01-10T05:29:09Z","title":"Zero-shot Shark Tracking and Biometrics from Aerial Imagery","summary":" The recent widespread adoption of drones for studying marine animals provides\nopportunities for deriving biological information from aerial imagery. The\nlarge scale of imagery data acquired from drones is well suited for machine\nlearning (ML) analysis. Development of ML models for analyzing marine animal\naerial imagery has followed the classical paradigm of training, testing, and\ndeploying a new model for each dataset, requiring significant time, human\neffort, and ML expertise. We introduce Frame Level ALIgment and tRacking\n(FLAIR), which leverages the video understanding of Segment Anything Model 2\n(SAM2) and the vision-language capabilities of Contrastive Language-Image\nPre-training (CLIP). FLAIR takes a drone video as input and outputs\nsegmentation masks of the species of interest across the video. Notably, FLAIR\nleverages a zero-shot approach, eliminating the need for labeled data, training\na new model, or fine-tuning an existing model to generalize to other species.\nWith a dataset of 18,000 drone images of Pacific nurse sharks, we trained\nstate-of-the-art object detection models to compare against FLAIR. We show that\nFLAIR massively outperforms these object detectors and performs competitively\nagainst two human-in-the-loop methods for prompting SAM2, achieving a Dice\nscore of 0.81. FLAIR readily generalizes to other shark species without\nadditional human effort and can be combined with novel heuristics to\nautomatically extract relevant information including length and tailbeat\nfrequency. 
FLAIR has significant potential to accelerate aerial imagery\nanalysis workflows, requiring markedly less human effort and expertise than\ntraditional machine learning workflows, while achieving superior accuracy. By\nreducing the effort required for aerial imagery analysis, FLAIR allows\nscientists to spend more time interpreting results and deriving insights about\nmarine ecosystems.\n","authors":["Chinmay K Lalgudi","Mark E Leone","Jaden V Clark","Sergio Madrigal-Mora","Mario Espinoza"],"pdf_url":"https://arxiv.org/pdf/2501.05717v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04974v2","updated":"2025-01-10T05:15:34Z","published":"2025-01-09T05:06:44Z","title":"SensorQA: A Question Answering Benchmark for Daily-Life Monitoring","summary":" With the rapid growth in sensor data, effectively interpreting and\ninterfacing with these data in a human-understandable way has become crucial.\nWhile existing research primarily focuses on learning classification models,\nfewer studies have explored how end users can actively extract useful insights\nfrom sensor data, often hindered by the lack of a proper dataset. To address\nthis gap, we introduce SensorQA, the first human-created question-answering\n(QA) dataset for long-term time-series sensor data for daily life monitoring.\nSensorQA is created by human workers and includes 5.6K diverse and practical\nqueries that reflect genuine human interests, paired with accurate answers\nderived from sensor data. We further establish benchmarks for state-of-the-art\nAI models on this dataset and evaluate their performance on typical edge\ndevices. Our results reveal a gap between current models and optimal QA\nperformance and efficiency, highlighting the need for new contributions. 
The\ndataset and code are available at:\n\\url{https://github.com/benjamin-reichman/SensorQA}.\n","authors":["Benjamin Reichman","Xiaofan Yu","Lanxiang Hu","Jack Truxal","Atishay Jain","Rushil Chandrupatla","Tajana Šimunić Rosing","Larry Heck"],"pdf_url":"https://arxiv.org/pdf/2501.04974v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05714v1","updated":"2025-01-10T05:15:14Z","published":"2025-01-10T05:15:14Z","title":"How to Enable Effective Cooperation Between Humans and NLP Models: A\n Survey of Principles, Formalizations, and Beyond","summary":" With the advancement of large language models (LLMs), intelligent models have\nevolved from mere tools to autonomous agents with their own goals and\nstrategies for cooperating with humans. This evolution has birthed a novel\nparadigm in NLP, i.e., human-model cooperation, that has yielded remarkable\nprogress in numerous NLP tasks in recent years. In this paper, we take the\nfirst step to present a thorough review of human-model cooperation, exploring\nits principles, formalizations, and open challenges. In particular, we\nintroduce a new taxonomy that provides a unified perspective to summarize\nexisting approaches. Also, we discuss potential frontier areas and their\ncorresponding challenges. We regard our work as an entry point, paving the way\nfor more breakthrough research in this regard.\n","authors":["Chen Huang","Yang Deng","Wenqiang Lei","Jiancheng Lv","Tat-Seng Chua","Jimmy Xiangji Huang"],"pdf_url":"https://arxiv.org/pdf/2501.05714v1.pdf","comment":"23 pages"},{"id":"http://arxiv.org/abs/2501.05707v1","updated":"2025-01-10T04:35:46Z","published":"2025-01-10T04:35:46Z","title":"Multiagent Finetuning: Self Improvement with Diverse Reasoning Chains","summary":" Large language models (LLMs) have achieved remarkable performance in recent\nyears but are fundamentally limited by the underlying training data. 
To improve\nmodels beyond the training data, recent works have explored how LLMs can be\nused to generate synthetic data for autonomous self-improvement. However,\nsuccessive steps of self-improvement can reach a point of diminishing returns.\nIn this work, we propose a complementary approach towards self-improvement\nwhere finetuning is applied to a multiagent society of language models. A group\nof language models, all starting from the same base model, are independently\nspecialized by updating each one using data generated through multiagent\ninteractions among the models. By training each model on independent sets of\ndata, we illustrate how this approach enables specialization across models and\ndiversification over the set of models. As a result, our overall system is able\nto preserve diverse reasoning chains and autonomously improve over many more\nrounds of fine-tuning than single-agent self-improvement methods. We\nquantitatively illustrate the efficacy of the approach across a wide suite of\nreasoning tasks.\n","authors":["Vighnesh Subramaniam","Yilun Du","Joshua B. Tenenbaum","Antonio Torralba","Shuang Li","Igor Mordatch"],"pdf_url":"https://arxiv.org/pdf/2501.05707v1.pdf","comment":"22 pages, 13 figures, 7 tables; Project page at\n https://llm-multiagent-ft.github.io/"},{"id":"http://arxiv.org/abs/2408.01933v3","updated":"2025-01-10T04:09:43Z","published":"2024-08-04T05:15:02Z","title":"DiReCT: Diagnostic Reasoning for Clinical Notes via Large Language\n Models","summary":" Large language models (LLMs) have recently showcased remarkable capabilities,\nspanning a wide range of tasks and applications, including those in the medical\ndomain. Models like GPT-4 excel in medical question answering but may face\nchallenges in the lack of interpretability when handling complex tasks in real\nclinical settings. 
We thus introduce the diagnostic reasoning dataset for\nclinical notes (DiReCT), aiming at evaluating the reasoning ability and\ninterpretability of LLMs compared to human doctors. It contains 511 clinical\nnotes, each meticulously annotated by physicians, detailing the diagnostic\nreasoning process from observations in a clinical note to the final diagnosis.\nAdditionally, a diagnostic knowledge graph is provided to offer essential\nknowledge for reasoning, which may not be covered in the training data of\nexisting LLMs. Evaluations of leading LLMs on DiReCT bring out a significant\ngap between their reasoning ability and that of human doctors, highlighting the\ncritical need for models that can reason effectively in real-world clinical\nscenarios.\n","authors":["Bowen Wang","Jiuyang Chang","Yiming Qian","Guoxin Chen","Junhao Chen","Zhouqiang Jiang","Jiahao Zhang","Yuta Nakashima","Hajime Nagahara"],"pdf_url":"https://arxiv.org/pdf/2408.01933v3.pdf","comment":"9 pages,6 figures"},{"id":"http://arxiv.org/abs/2411.12924v2","updated":"2025-01-10T03:55:57Z","published":"2024-11-19T23:22:33Z","title":"Human-In-the-Loop Software Development Agents","summary":" Recently, Large Language Models (LLMs)-based multi-agent paradigms for\nsoftware engineering are introduced to automatically resolve software\ndevelopment tasks (e.g., from a given issue to source code). However, existing\nwork is evaluated based on historical benchmark datasets, rarely considers\nhuman feedback at each stage of the automated software development process, and\nhas not been deployed in practice. In this paper, we introduce a\nHuman-in-the-loop LLM-based Agents framework (HULA) for software development\nthat allows software engineers to refine and guide LLMs when generating coding\nplans and source code for a given task. We design, implement, and deploy the\nHULA framework into Atlassian JIRA for internal uses. 
Through a multi-stage\nevaluation of the HULA framework, Atlassian software engineers perceive that\nHULA can minimize the overall development time and effort, especially in\ninitiating a coding plan and writing code for straightforward tasks. On the\nother hand, challenges around code quality remain a concern in some cases. We\ndraw lessons learned and discuss opportunities for future work, which will pave\nthe way for the advancement of LLM-based agents in software development.\n","authors":["Wannita Takerngsaksiri","Jirat Pasuksmit","Patanamon Thongtanunam","Chakkrit Tantithamthavorn","Ruixiong Zhang","Fan Jiang","Jing Li","Evan Cook","Kun Chen","Ming Wu"],"pdf_url":"https://arxiv.org/pdf/2411.12924v2.pdf","comment":"10 pages, 9 figures, ICSE SEIP 2025"},{"id":"http://arxiv.org/abs/2409.01148v3","updated":"2025-01-10T03:28:00Z","published":"2024-09-02T10:33:45Z","title":"FMRFT: Fusion Mamba and DETR for Query Time Sequence Intersection Fish\n Tracking","summary":" Early detection of abnormal fish behavior caused by disease or hunger can be\nachieved through fish tracking using deep learning techniques, which holds\nsignificant value for industrial aquaculture. However, underwater reflections\nand fish-specific factors, such as high visual similarity, rapid swimming\ncaused by stimuli, and mutual occlusion, bring challenges to multi-target\ntracking of fish. To address these challenges, this paper establishes a complex\nmulti-scenario sturgeon tracking dataset and introduces the FMRFT model, a\nreal-time end-to-end fish tracking solution. The model incorporates the low\nvideo memory consumption Mamba In Mamba (MIM) architecture, which facilitates\nmulti-frame temporal memory and feature extraction, thereby addressing the\nchallenge of tracking multiple fish across frames. 
Additionally, the FMRFT model\nwith the Query Time Sequence Intersection (QTSI) module effectively manages\noccluded objects and reduces redundant tracking frames using the superior\nfeature interaction and prior frame processing capabilities of RT-DETR. This\ncombination significantly enhances the accuracy and stability of fish tracking.\nTrained and tested on the dataset, the model achieves an IDF1 score of 90.3%\nand a MOTA accuracy of 94.3%. Experimental results show that the proposed FMRFT\nmodel effectively addresses the challenges of high similarity and mutual\nocclusion in fish populations, enabling accurate tracking in factory farming\nenvironments.\n","authors":["Mingyuan Yao","Yukang Huo","Qingbin Tian","Jiayin Zhao","Xiao Liu","Ruifeng Wang","Lin Xue","Haihua Wang"],"pdf_url":"https://arxiv.org/pdf/2409.01148v3.pdf","comment":"14 pages,14 figures"},{"id":"http://arxiv.org/abs/2501.05680v1","updated":"2025-01-10T03:07:28Z","published":"2025-01-10T03:07:28Z","title":"EXION: Exploiting Inter- and Intra-Iteration Output Sparsity for\n Diffusion Models","summary":" Over the past few years, diffusion models have emerged as novel AI solutions,\ngenerating diverse multi-modal outputs from text prompts. Despite their\ncapabilities, they face challenges in computing, such as excessive latency and\nenergy consumption due to their iterative architecture. Although prior works\nspecialized in transformer acceleration can be applied, the iterative nature of\ndiffusion models remains unresolved. In this paper, we present EXION, the first\nSW-HW co-designed diffusion accelerator that solves the computation challenges\nby exploiting the unique inter- and intra-iteration output sparsity in\ndiffusion models. To this end, we propose two SW-level optimizations. First, we\nintroduce the FFN-Reuse algorithm that identifies and skips redundant\ncomputations in FFN layers across different iterations (inter-iteration\nsparsity). 
Second, we use a modified eager prediction method that employs\ntwo-step leading-one detection to accurately predict the attention score,\nskipping unnecessary computations within an iteration (intra-iteration\nsparsity). We also introduce a novel data compaction mechanism named ConMerge,\nwhich can enhance HW utilization by condensing and merging sparse matrices into\ncompact forms. Finally, it has a dedicated HW architecture that supports the\nabove sparsity-inducing algorithms, translating high output sparsity into\nimproved energy efficiency and performance. To verify the feasibility of the\nEXION, we first demonstrate that it has no impact on accuracy in various types\nof multi-modal diffusion models. We then instantiate EXION in both server- and\nedge-level settings and compare its performance against GPUs with similar\nspecifications. Our evaluation shows that EXION achieves dramatic improvements\nin performance and energy efficiency by 3.2-379.3x and 45.1-3067.6x compared to\na server GPU and by 42.6-1090.9x and 196.9-4668.2x compared to an edge GPU.\n","authors":["Jaehoon Heo","Adiwena Putra","Jieon Yoon","Sungwoong Yune","Hangyeol Lee","Ji-Hoon Kim","Joo-Young Kim"],"pdf_url":"https://arxiv.org/pdf/2501.05680v1.pdf","comment":"To appear in 2025 IEEE International Symposium on High-Performance\n Computer Architecture (HPCA 2025)"},{"id":"http://arxiv.org/abs/2501.05675v1","updated":"2025-01-10T02:57:08Z","published":"2025-01-10T02:57:08Z","title":"Facilitate Collaboration between Large Language Model and Task-specific\n Model for Time Series Anomaly Detection","summary":" In anomaly detection, methods based on large language models (LLMs) can\nincorporate expert knowledge, while task-specific smaller models excel at\nextracting normal patterns and detecting value fluctuations. 
Inspired by the\nhuman nervous system, where the brain stores expert knowledge and the\nperipheral nervous system and spinal cord handle specific tasks like withdrawal\nand knee-jerk reflexes, we propose CoLLaTe, a framework designed to facilitate\ncollaboration between LLMs and task-specific models, leveraging the strengths\nof both.\n In this work, we first formulate the collaboration process and identify two\nkey challenges in the collaboration between LLMs and task-specific models: (1)\nthe misalignment between the expression domains of LLMs and smaller models, and\n(2) error accumulation arising from the predictions of both models.\n To address these challenges, we introduce two key components in CoLLaTe: the\nalignment module and the collaborative loss function. Through theoretical\nanalysis and experimental validation, we demonstrate that these components\neffectively mitigate the identified challenges and achieve better performance\nthan LLM-based methods and task-specific smaller models.\n","authors":["Feiyi Chen","Leilei Zhang","Guansong Pang","Roger Zimmermann","Shuiguang Deng"],"pdf_url":"https://arxiv.org/pdf/2501.05675v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05673v1","updated":"2025-01-10T02:51:58Z","published":"2025-01-10T02:51:58Z","title":"Network Diffuser for Placing-Scheduling Service Function Chains with\n Inverse Demonstration","summary":" Network services are increasingly managed by considering chained-up virtual\nnetwork functions and relevant traffic flows, known as the Service Function\nChains (SFCs). To deal with sequential arrivals of SFCs in an online fashion,\nwe must consider two closely-coupled problems - an SFC placement problem that\nmaps SFCs to servers/links in the network and an SFC scheduling problem that\ndetermines when each SFC is executed. Solving the whole SFC problem targeting\nthese two optimizations jointly is extremely challenging. 
In this paper, we\npropose a novel network diffuser using conditional generative modeling for this\nSFC placing-scheduling optimization. Recent advances in generative AI and\ndiffusion models have made it possible to generate high-quality images/videos\nand decision trajectories from language description. We formulate the SFC\noptimization as a problem of generating a state sequence for planning and\nperform graph diffusion on the state trajectories to enable extraction of SFC\ndecisions, with SFC optimization constraints and objectives as conditions. To\naddress the lack of demonstration data due to NP-hardness and exponential\nproblem space of the SFC optimization, we also propose a novel and somewhat\nmaverick approach -- Rather than solving instances of this difficult\noptimization, we start with randomly-generated solutions as input, and then\ndetermine appropriate SFC optimization problems that render these solutions\nfeasible. This inverse demonstration enables us to obtain sufficient expert\ndemonstrations, i.e., problem-solution pairs, through further optimization. In\nour numerical evaluations, the proposed network diffuser outperforms learning\nand heuristic baselines, by $\\sim$20\\% improvement in SFC reward and $\\sim$50\\%\nreduction in SFC waiting time and blocking rate.\n","authors":["Zuyuan Zhang","Vaneet Aggarwal","Tian Lan"],"pdf_url":"https://arxiv.org/pdf/2501.05673v1.pdf","comment":"Accepted to IEEE INFOCOM 2025"},{"id":"http://arxiv.org/abs/2303.16045v4","updated":"2025-01-10T02:39:43Z","published":"2023-03-28T15:20:25Z","title":"An Optimal, Universal and Agnostic Decoding Method for Message\n Reconstruction, Bio and Technosignature Detection","summary":" We present an agnostic signal reconstruction method for zero-knowledge\none-way communication channels in which a receiver aims to interpret a message\nsent by an unknown source about which no prior knowledge is available and to\nwhich no return message can be sent. 
Our reconstruction method is agnostic\nvis-\\`a-vis the arbitrarily chosen encoding-decoding scheme and other\nobserver-dependent characteristics, such as the arbitrarily chosen\ncomputational model, probability distributions, or underlying mathematical\ntheory. We investigate how non-random messages encode information about their\nintended physical properties, such as dimension and length scales of the space\nin which a signal or message may have been originally encoded, embedded, or\ngenerated. We focus on image data as a first illustration of the capabilities\nof the new method. We argue that our results have applications to life and\ntechnosignature detection, and to coding theory in general.\n","authors":["Hector Zenil","Alyssa Adams","Felipe S. Abrahão","Luan Ozelim"],"pdf_url":"https://arxiv.org/pdf/2303.16045v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05667v1","updated":"2025-01-10T02:33:15Z","published":"2025-01-10T02:33:15Z","title":"TransPlace: Transferable Circuit Global Placement via Graph Neural\n Network","summary":" Global placement, a critical step in designing the physical layout of\ncomputer chips, is essential to optimize chip performance. Prior global\nplacement methods optimize each circuit design individually from scratch. Their\nneglect of transferable knowledge limits solution efficiency and chip\nperformance as circuit complexity drastically increases. This study presents\nTransPlace, a global placement framework that learns to place millions of\nmixed-size cells in continuous space. TransPlace introduces i) Netlist Graph to\nefficiently model netlist topology, ii) Cell-flow and relative position\nencoding to learn SE(2)-invariant representation, iii) a tailored graph neural\nnetwork architecture for informed parameterization of placement knowledge, and\niv) a two-stage strategy for coarse-to-fine placement. 
Compared to\nstate-of-the-art placement methods, TransPlace, trained on a few high-quality\nplacements, can place unseen circuits with 1.2x speedup while reducing\ncongestion by 30%, timing by 9%, and wirelength by 5%.\n","authors":["Yunbo Hou","Haoran Ye","Yingxue Zhang","Siyuan Xu","Guojie Song"],"pdf_url":"https://arxiv.org/pdf/2501.05667v1.pdf","comment":"Accepted at KDD 2025"},{"id":"http://arxiv.org/abs/2409.12953v4","updated":"2025-01-10T02:31:03Z","published":"2024-09-19T17:58:16Z","title":"JourneyBench: A Challenging One-Stop Vision-Language Understanding\n Benchmark of Generated Images","summary":" Existing vision-language understanding benchmarks largely consist of images\nof objects in their usual contexts. As a consequence, recent multimodal large\nlanguage models can perform well with only a shallow visual understanding by\nrelying on background language biases. Thus, strong performance on these\nbenchmarks does not necessarily correlate with strong visual understanding. In\nthis paper, we release JourneyBench, a comprehensive human-annotated benchmark\nof generated images designed to assess the model's fine-grained multimodal\nreasoning abilities across five tasks: complementary multimodal chain of\nthought, multi-image VQA, imaginary image captioning, VQA with hallucination\ntriggers, and fine-grained retrieval with sample-specific distractors. Unlike\nexisting benchmarks, JourneyBench explicitly requires fine-grained multimodal\nreasoning in unusual imaginary scenarios where language bias and holistic image\ngist are insufficient. We benchmark state-of-the-art models on JourneyBench and\nanalyze performance along a number of fine-grained dimensions. Results across\nall five tasks show that JourneyBench is exceptionally challenging for even the\nbest models, indicating that models' visual reasoning abilities are not as\nstrong as they first appear. 
We discuss the implications of our findings and\npropose avenues for further research.\n","authors":["Zhecan Wang","Junzhang Liu","Chia-Wei Tang","Hani Alomari","Anushka Sivakumar","Rui Sun","Wenhao Li","Md. Atabuzzaman","Hammad Ayyubi","Haoxuan You","Alvi Ishmam","Kai-Wei Chang","Shih-Fu Chang","Chris Thomas"],"pdf_url":"https://arxiv.org/pdf/2409.12953v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05663v1","updated":"2025-01-10T02:28:19Z","published":"2025-01-10T02:28:19Z","title":"Learning to Measure Quantum Neural Networks","summary":" The rapid progress in quantum computing (QC) and machine learning (ML) has\nattracted growing attention, prompting extensive research into quantum machine\nlearning (QML) algorithms to solve diverse and complex problems. Designing\nhigh-performance QML models demands expert-level proficiency, which remains a\nsignificant obstacle to the broader adoption of QML. A few major hurdles\ninclude crafting effective data encoding techniques and parameterized quantum\ncircuits, both of which are crucial to the performance of QML models.\nAdditionally, the measurement phase is frequently overlooked-most current QML\nmodels rely on pre-defined measurement protocols that often fail to account for\nthe specific problem being addressed. We introduce a novel approach that makes\nthe observable of the quantum system-specifically, the Hermitian\nmatrix-learnable. Our method features an end-to-end differentiable learning\nframework, where the parameterized observable is trained alongside the ordinary\nquantum circuit parameters simultaneously. 
Using numerical simulations, we show\nthat the proposed method can identify observables for variational quantum\ncircuits that lead to improved outcomes, such as higher classification\naccuracy, thereby boosting the overall performance of QML models.\n","authors":["Samuel Yen-Chi Chen","Huan-Hsin Tseng","Hsin-Yi Lin","Shinjae Yoo"],"pdf_url":"https://arxiv.org/pdf/2501.05663v1.pdf","comment":"Accepted by ICASSP 2025 Workshop: Quantum Machine Learning in Signal\n Processing and Artificial Intelligence"},{"id":"http://arxiv.org/abs/2501.05662v1","updated":"2025-01-10T02:28:04Z","published":"2025-01-10T02:28:04Z","title":"Cascaded Self-Evaluation Augmented Training for Efficient Multimodal\n Large Language Models","summary":" Efficient Multimodal Large Language Models (EMLLMs) have rapidly advanced\nrecently. Incorporating Chain-of-Thought (CoT) reasoning and step-by-step\nself-evaluation has improved their performance. However, limited parameters\noften hinder EMLLMs from effectively using self-evaluation during inference.\nKey challenges include synthesizing evaluation data, determining its quantity,\noptimizing training and inference strategies, and selecting appropriate\nprompts.\n To address these issues, we introduce Self-Evaluation Augmented Training\n(SEAT). SEAT uses more powerful EMLLMs for CoT reasoning, data selection, and\nevaluation generation, then trains EMLLMs with the synthesized data. However,\nhandling long prompts and maintaining CoT reasoning quality are problematic.\nTherefore, we propose Cascaded Self-Evaluation Augmented Training (Cas-SEAT),\nwhich breaks down lengthy prompts into shorter, task-specific cascaded prompts\nand reduces costs for resource-limited settings. 
During data synthesis, we\nemploy open-source 7B-parameter EMLLMs and annotate a small dataset with short\nprompts.\n Experiments demonstrate that Cas-SEAT significantly boosts EMLLMs'\nself-evaluation abilities, improving performance by 19.68%, 55.57%, and 46.79%\non the MathVista, Math-V, and We-Math datasets, respectively. Additionally, our\nCas-SEAT Dataset serves as a valuable resource for future research in enhancing\nEMLLM self-evaluation.\n","authors":["Zheqi Lv","Wenkai Wang","Jiawei Wang","Shengyu Zhang","Fei Wu"],"pdf_url":"https://arxiv.org/pdf/2501.05662v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.11484v9","updated":"2025-01-10T02:18:01Z","published":"2024-07-16T08:20:39Z","title":"The Oscars of AI Theater: A Survey on Role-Playing with Language Models","summary":" This survey explores the burgeoning field of role-playing with language\nmodels, focusing on their development from early persona-based models to\nadvanced character-driven simulations facilitated by Large Language Models\n(LLMs). Initially confined to simple persona consistency due to limited model\ncapabilities, role-playing tasks have now expanded to embrace complex character\nportrayals involving character consistency, behavioral alignment, and overall\nattractiveness. We provide a comprehensive taxonomy of the critical components\nin designing these systems, including data, models and alignment, agent\narchitecture and evaluation. This survey not only outlines the current\nmethodologies and challenges, such as managing dynamic personal profiles and\nachieving high-level persona consistency but also suggests avenues for future\nresearch in improving the depth and realism of role-playing applications. The\ngoal is to guide future research by offering a structured overview of current\nmethodologies and identifying potential areas for improvement. 
Related\nresources and papers are available at\nhttps://github.com/nuochenpku/Awesome-Role-Play-Papers.\n","authors":["Nuo Chen","Yan Wang","Yang Deng","Jia Li"],"pdf_url":"https://arxiv.org/pdf/2407.11484v9.pdf","comment":"28 pages"},{"id":"http://arxiv.org/abs/2404.11917v2","updated":"2025-01-10T02:08:52Z","published":"2024-04-18T05:48:15Z","title":"Expected Coordinate Improvement for High-Dimensional Bayesian\n Optimization","summary":" Bayesian optimization (BO) algorithm is very popular for solving\nlow-dimensional expensive optimization problems. Extending Bayesian\noptimization to high dimension is a meaningful but challenging task. One of the\nmajor challenges is that it is difficult to find good infill solutions as the\nacquisition functions are also high-dimensional. In this work, we propose the\nexpected coordinate improvement (ECI) criterion for high-dimensional Bayesian\noptimization. The proposed ECI criterion measures the potential improvement we\ncan get by moving the current best solution along one coordinate. The proposed\napproach selects the coordinate with the highest ECI value to refine in each\niteration and covers all the coordinates gradually by iterating over the\ncoordinates. The greatest advantage of the proposed ECI-BO (expected coordinate\nimprovement based Bayesian optimization) algorithm over the standard BO\nalgorithm is that the infill selection problem of the proposed algorithm is\nalways a one-dimensional problem thus can be easily solved. Numerical\nexperiments show that the proposed algorithm can achieve significantly better\nresults than the standard BO algorithm and competitive results when compared\nwith five state-of-the-art high-dimensional BOs. 
This work provides a simple\nbut efficient approach for high-dimensional Bayesian optimization.\n","authors":["Dawei Zhan"],"pdf_url":"https://arxiv.org/pdf/2404.11917v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05647v1","updated":"2025-01-10T01:27:12Z","published":"2025-01-10T01:27:12Z","title":"Collaboration of Large Language Models and Small Recommendation Models\n for Device-Cloud Recommendation","summary":" Large Language Models (LLMs) for Recommendation (LLM4Rec) is a promising\nresearch direction that has demonstrated exceptional performance in this field.\nHowever, its inability to capture real-time user preferences greatly limits the\npractical application of LLM4Rec because (i) LLMs are costly to train and infer\nfrequently, and (ii) LLMs struggle to access real-time data (its large number\nof parameters poses an obstacle to deployment on devices). Fortunately, small\nrecommendation models (SRMs) can effectively supplement these shortcomings of\nLLM4Rec diagrams by consuming minimal resources for frequent training and\ninference, and by conveniently accessing real-time data on devices.\n In light of this, we designed the Device-Cloud LLM-SRM Collaborative\nRecommendation Framework (LSC4Rec) under a device-cloud collaboration setting.\nLSC4Rec aims to integrate the advantages of both LLMs and SRMs, as well as the\nbenefits of cloud and edge computing, achieving a complementary synergy. We\nenhance the practicability of LSC4Rec by designing three strategies:\ncollaborative training, collaborative inference, and intelligent request.\nDuring training, LLM generates candidate lists to enhance the ranking ability\nof SRM in collaborative scenarios and enables SRM to update adaptively to\ncapture real-time user interests. During inference, LLM and SRM are deployed on\nthe cloud and on the device, respectively. 
LLM generates candidate lists and\ninitial ranking results based on user behavior, and SRM get reranking results\nbased on the candidate list, with final results integrating both LLM's and\nSRM's scores. The device determines whether a new candidate list is needed by\ncomparing the consistency of the LLM's and SRM's sorted lists. Our\ncomprehensive and extensive experimental analysis validates the effectiveness\nof each strategy in LSC4Rec.\n","authors":["Zheqi Lv","Tianyu Zhan","Wenjie Wang","Xinyu Lin","Shengyu Zhang","Wenqiao Zhang","Jiwei Li","Kun Kuang","Fei Wu"],"pdf_url":"https://arxiv.org/pdf/2501.05647v1.pdf","comment":"Published on KDD'25: Proceedings of the ACM SIGKDD Conference on\n Knowledge Discovery and Data Mining 2025"},{"id":"http://arxiv.org/abs/2501.05646v1","updated":"2025-01-10T01:25:01Z","published":"2025-01-10T01:25:01Z","title":"Efficient Representations for High-Cardinality Categorical Variables in\n Machine Learning","summary":" High\\-cardinality categorical variables pose significant challenges in\nmachine learning, particularly in terms of computational efficiency and model\ninterpretability. Traditional one\\-hot encoding often results in\nhigh\\-dimensional sparse feature spaces, increasing the risk of overfitting and\nreducing scalability. This paper introduces novel encoding techniques,\nincluding means encoding, low\\-rank encoding, and multinomial logistic\nregression encoding, to address these challenges. These methods leverage\nsufficient representations to generate compact and informative embeddings of\ncategorical data. 
We conduct rigorous theoretical analyses and empirical\nvalidations on diverse datasets, demonstrating significant improvements in\nmodel performance and computational efficiency compared to baseline methods.\nThe proposed techniques are particularly effective in domains requiring\nscalable solutions for large datasets, paving the way for more robust and\nefficient applications in machine learning.\n","authors":["Zixuan Liang"],"pdf_url":"https://arxiv.org/pdf/2501.05646v1.pdf","comment":"2025 International Conference on Advanced Machine Learning and Data\n Science (AMLDS 2025)"},{"id":"http://arxiv.org/abs/2412.18544v2","updated":"2025-01-10T01:06:06Z","published":"2024-12-24T16:51:35Z","title":"Consistency Checks for Language Model Forecasters","summary":" Forecasting is a task that is difficult to evaluate: the ground truth can\nonly be known in the future. Recent work showing LLM forecasters rapidly\napproaching human-level performance begs the question: how can we benchmark and\nevaluate these forecasters instantaneously? Following the consistency check\nframework, we measure the performance of forecasters in terms of the\nconsistency of their predictions on different logically-related questions. We\npropose a new, general consistency metric based on arbitrage: for example, if a\nforecasting AI illogically predicts that both the Democratic and Republican\nparties have 60% probability of winning the 2024 US presidential election, an\narbitrageur can trade against the forecaster's predictions and make a profit.\nWe build an automated evaluation system that generates a set of base questions,\ninstantiates consistency checks from these questions, elicits the predictions\nof the forecaster, and measures the consistency of the predictions. We then\nbuild a standard, proper-scoring-rule forecasting benchmark, and show that our\n(instantaneous) consistency metrics correlate with LLM forecasters' ground\ntruth Brier scores (which are only known in the future). 
We also release a\nconsistency benchmark that resolves in 2028, providing a long-term evaluation\ntool for forecasting.\n","authors":["Daniel Paleka","Abhimanyu Pallavi Sudhir","Alejandro Alvarez","Vineeth Bhat","Adam Shen","Evan Wang","Florian Tramèr"],"pdf_url":"https://arxiv.org/pdf/2412.18544v2.pdf","comment":"55 pages, 25 figures. Submitted to ICLR 2025"},{"id":"http://arxiv.org/abs/2501.05643v1","updated":"2025-01-10T01:00:05Z","published":"2025-01-10T01:00:05Z","title":"Iconicity in Large Language Models","summary":" Lexical iconicity, a direct relation between a word's meaning and its form,\nis an important aspect of every natural language, most commonly manifesting\nthrough sound-meaning associations. Since Large language models' (LLMs') access\nto both meaning and sound of text is only mediated (meaning through textual\ncontext, sound through written representation, further complicated by\ntokenization), we might expect that the encoding of iconicity in LLMs would be\neither insufficient or significantly different from human processing. This\nstudy addresses this hypothesis by having GPT-4 generate highly iconic\npseudowords in artificial languages. To verify that these words actually carry\niconicity, we had their meanings guessed by Czech and German participants\n(n=672) and subsequently by LLM-based participants (generated by GPT-4 and\nClaude 3.5 Sonnet). The results revealed that humans can guess the meanings of\npseudowords in the generated iconic language more accurately than words in\ndistant natural languages and that LLM-based participants are even more\nsuccessful than humans in this task. 
This core finding is accompanied by\nseveral additional analyses concerning the universality of the generated\nlanguage and the cues that both human and LLM-based participants utilize.\n","authors":["Anna Marklová","Jiří Milička","Leonid Ryvkin","Ľudmila Lacková Bennet","Libuše Kormaníková"],"pdf_url":"https://arxiv.org/pdf/2501.05643v1.pdf","comment":"Supplementary information: https://osf.io/ywjrk/"},{"id":"http://arxiv.org/abs/2501.05629v1","updated":"2025-01-10T00:10:21Z","published":"2025-01-10T00:10:21Z","title":"The Impact of Model Scaling on Seen and Unseen Language Performance","summary":" The rapid advancement of Large Language Models (LLMs), particularly those\ntrained on multilingual corpora, has intensified the need for a deeper\nunderstanding of their performance across a diverse range of languages and\nmodel sizes. Our research addresses this critical need by studying the\nperformance and scaling behavior of multilingual LLMs in text classification\nand machine translation tasks across 204 languages. We systematically examine\nboth seen and unseen languages across three model families of varying sizes in\nzero-shot and few-shot settings. Our findings show significant differences in\nscaling behavior between zero-shot and two-shot scenarios, with striking\ndisparities in performance between seen and unseen languages. Model scale has\nlittle effect on zero-shot performance, which remains mostly flat. However, in\ntwo-shot settings, larger models show clear linear improvements in multilingual\ntext classification. For translation tasks, however, only the instruction-tuned\nmodel showed clear benefits from scaling. 
Our analysis also suggests that\noverall resource levels, not just the proportions of pretraining languages, are\nbetter predictors of model performance, shedding light on what drives\nmultilingual LLM effectiveness.\n","authors":["Rhitabrat Pokharel","Sina Bagheri Nezhad","Ameeta Agrawal","Suresh Singh"],"pdf_url":"https://arxiv.org/pdf/2501.05629v1.pdf","comment":"Accepted at SEAS Workshop at AAAI25"}]},"2025-01-13T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2501.07566v1","updated":"2025-01-13T18:54:02Z","published":"2025-01-13T18:54:02Z","title":"SafeSwarm: Decentralized Safe RL for the Swarm of Drones Landing in\n Dense Crowds","summary":" This paper introduces a safe swarm of drones capable of performing landings\nin crowded environments robustly by relying on Reinforcement Learning\ntechniques combined with Safe Learning. The developed system allows us to teach\nthe swarm of drones with different dynamics to land on moving landing pads in\nan environment while avoiding collisions with obstacles and between agents.\n The safe barrier net algorithm was developed and evaluated using a swarm of\nCrazyflie 2.1 micro quadrotors, which were tested indoors with the Vicon motion\ncapture system to ensure precise localization and control.\n Experimental results show that our system achieves landing accuracy of 2.25\ncm with a mean time of 17 s and collision-free landings, underscoring its\neffectiveness and robustness in real-world scenarios. 
This work offers a\npromising foundation for applications in environments where safety and\nprecision are paramount.\n","authors":["Grik Tadevosyan","Maksim Osipenko","Demetros Aschu","Aleksey Fedoseev","Valerii Serpiva","Oleg Sautenkov","Sausar Karaf","Dzmitry Tsetserukou"],"pdf_url":"https://arxiv.org/pdf/2501.07566v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04987v2","updated":"2025-01-13T18:24:22Z","published":"2024-11-07T18:55:10Z","title":"Few-Shot Task Learning through Inverse Generative Modeling","summary":" Learning the intents of an agent, defined by its goals or motion style, is\noften extremely challenging from just a few examples. We refer to this problem\nas task concept learning and present our approach, Few-Shot Task Learning\nthrough Inverse Generative Modeling (FTL-IGM), which learns new task concepts\nby leveraging invertible neural generative models. The core idea is to pretrain\na generative model on a set of basic concepts and their demonstrations. Then,\ngiven a few demonstrations of a new concept (such as a new goal or a new\naction), our method learns the underlying concepts through backpropagation\nwithout updating the model weights, thanks to the invertibility of the\ngenerative model. We evaluate our method in five domains -- object\nrearrangement, goal-oriented navigation, motion caption of human actions,\nautonomous driving, and real-world table-top manipulation. 
Our experimental\nresults demonstrate that via the pretrained generative model, we successfully\nlearn novel concepts and generate agent plans or motion corresponding to these\nconcepts in (1) unseen environments and (2) in composition with training\nconcepts.\n","authors":["Aviv Netanyahu","Yilun Du","Antonia Bronars","Jyothish Pari","Joshua Tenenbaum","Tianmin Shu","Pulkit Agrawal"],"pdf_url":"https://arxiv.org/pdf/2411.04987v2.pdf","comment":"Added acknowledgment"},{"id":"http://arxiv.org/abs/2501.07507v1","updated":"2025-01-13T17:25:46Z","published":"2025-01-13T17:25:46Z","title":"Inductive Learning of Robot Task Knowledge from Raw Data and Online\n Expert Feedback","summary":" The increasing level of autonomy of robots poses challenges of trust and\nsocial acceptance, especially in human-robot interaction scenarios. This\nrequires an interpretable implementation of robotic cognitive capabilities,\npossibly based on formal methods as logics for the definition of task\nspecifications. However, prior knowledge is often unavailable in complex\nrealistic scenarios.\n In this paper, we propose an offline algorithm based on inductive logic\nprogramming from noisy examples to extract task specifications (i.e., action\npreconditions, constraints and effects) directly from raw data of few\nheterogeneous (i.e., not repetitive) robotic executions. Our algorithm\nleverages on the output of any unsupervised action identification algorithm\nfrom video-kinematic recordings. Combining it with the definition of very\nbasic, almost task-agnostic, commonsense concepts about the environment, which\ncontribute to the interpretability of our methodology, we are able to learn\nlogical axioms encoding preconditions of actions, as well as their effects in\nthe event calculus paradigm. 
Since the quality of learned specifications\ndepends mainly on the accuracy of the action identification algorithm, we also\npropose an online framework for incremental refinement of task knowledge from\nuser feedback, guaranteeing safe execution. Results in a standard manipulation\ntask and benchmark for user training in the safety-critical surgical robotic\nscenario, show the robustness, data- and time-efficiency of our methodology,\nwith promising results towards the scalability in more complex domains.\n","authors":["Daniele Meli","Paolo Fiorini"],"pdf_url":"https://arxiv.org/pdf/2501.07507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07462v1","updated":"2025-01-13T16:32:13Z","published":"2025-01-13T16:32:13Z","title":"The Sense of Agency in Assistive Robotics Using Shared Autonomy","summary":" Sense of agency is one factor that influences people's preferences for robot\nassistance and a phenomenon from cognitive science that represents the\nexperience of control over one's environment. However, in assistive robotics\nliterature, we often see paradigms that optimize measures like task success and\ncognitive load, rather than sense of agency. In fact, prior work has found that\nparticipants sometimes express a preference for paradigms, such as direct\nteleoperation, which do not perform well with those other metrics but give more\ncontrol to the user. In this work, we focus on a subset of assistance paradigms\nfor manipulation called shared autonomy in which the system combines control\nsignals from the user and the automated control. We run a study to evaluate\nsense of agency and show that higher robot autonomy during assistance leads to\nimproved task performance but a decreased sense of agency, indicating a\npotential trade-off between task performance and sense of agency. 
From our\nfindings, we discuss the relation between sense of agency and optimality, and\nwe consider a proxy metric for a component of sense of agency which might\nenable us to build systems that monitor and maintain sense of agency in real\ntime.\n","authors":["Maggie A. Collier","Rithika Narayan","Henny Admoni"],"pdf_url":"https://arxiv.org/pdf/2501.07462v1.pdf","comment":"10 pages, 8 figure, HRI conference"},{"id":"http://arxiv.org/abs/2501.07421v1","updated":"2025-01-13T15:41:18Z","published":"2025-01-13T15:41:18Z","title":"Empirical Comparison of Four Stereoscopic Depth Sensing Cameras for\n Robotics Applications","summary":" Depth sensing is an essential technology in robotics and many other fields.\nMany depth sensing (or RGB-D) cameras are available on the market and selecting\nthe best one for your application can be challenging. In this work, we tested\nfour stereoscopic RGB-D cameras that sense the distance by using two images\nfrom slightly different views. We empirically compared four cameras (Intel\nRealSense D435, Intel RealSense D455, StereoLabs ZED 2, and Luxonis OAK-D Pro)\nin three scenarios: (i) planar surface perception, (ii) plastic doll\nperception, (iii) household object perception (YCB dataset). We recorded and\nevaluated more than 3,000 RGB-D frames for each camera. For table-top robotics\nscenarios with distance to objects up to one meter, the best performance is\nprovided by the D435 camera. For longer distances, the other three models\nperform better, making them more suitable for some mobile robotics\napplications. OAK-D Pro additionally offers integrated AI modules (e.g., object\nand human keypoint detection). ZED 2 is not a standalone device and requires a\ncomputer with a GPU for depth data acquisition. 
All data (more than 12,000\nRGB-D frames) are publicly available at https://osf.io/f2seb.\n","authors":["Lukas Rustler","Vojtech Volprecht","Matej Hoffmann"],"pdf_url":"https://arxiv.org/pdf/2501.07421v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07399v1","updated":"2025-01-13T15:17:10Z","published":"2025-01-13T15:17:10Z","title":"Efficiently Closing Loops in LiDAR-Based SLAM Using Point Cloud Density\n Maps","summary":" Consistent maps are key for most autonomous mobile robots. They often use\nSLAM approaches to build such maps. Loop closures via place recognition help\nmaintain accurate pose estimates by mitigating global drift. This paper\npresents a robust loop closure detection pipeline for outdoor SLAM with\nLiDAR-equipped robots. The method handles various LiDAR sensors with different\nscanning patterns, field of views and resolutions. It generates local maps from\nLiDAR scans and aligns them using a ground alignment module to handle both\nplanar and non-planar motion of the LiDAR, ensuring applicability across\nplatforms. The method uses density-preserving bird's eye view projections of\nthese local maps and extracts ORB feature descriptors from them for place\nrecognition. It stores the feature descriptors in a binary search tree for\nefficient retrieval, and self-similarity pruning addresses perceptual aliasing\nin repetitive environments. Extensive experiments on public and self-recorded\ndatasets demonstrate accurate loop closure detection, long-term localization,\nand cross-platform multi-map alignment, agnostic to the LiDAR scanning\npatterns, fields of view, and motion profiles.\n","authors":["Saurabh Gupta","Tiziano Guadagnino","Benedikt Mersch","Niklas Trekel","Meher V. R. 
Malladi","Cyrill Stachniss"],"pdf_url":"https://arxiv.org/pdf/2501.07399v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.08094v2","updated":"2025-01-13T14:53:11Z","published":"2023-05-14T08:10:49Z","title":"Accelerating genetic optimization of nonlinear model predictive control\n by learning optimal search space size","summary":" Genetic algorithm (GA) is typically used to solve nonlinear model predictive\ncontrol's optimization problem. However, the size of the search space in which\nthe GA searches for the optimal control inputs is crucial for its applicability\nto fast-response systems. This paper proposes accelerating the genetic\noptimization of NMPC by learning optimal search space size. The approach trains\na multivariate regression model to adaptively predict the best smallest size of\nthe search space in every control cycle. The proposed approach reduces the GA's\ncomputational time, improves the chance of convergence to better control\ninputs, and provides a stable and feasible solution. The proposed approach was\nevaluated on three nonlinear systems and compared to four other evolutionary\nalgorithms implemented in a processor-in-the-loop fashion. The results show\nthat the proposed approach provides a 17-45\\% reduction in computational time\nand increases the convergence rate by 35-47\\%. The source code is available on\nGitHub.\n","authors":["Eslam Mostafa","Hussein A. Aly","Ahmed Elliethy"],"pdf_url":"https://arxiv.org/pdf/2305.08094v2.pdf","comment":"Accepted by the Journal of Control and Decision"},{"id":"http://arxiv.org/abs/2412.19706v3","updated":"2025-01-13T14:15:59Z","published":"2024-12-27T16:00:24Z","title":"Geometric Freeze-Tag Problem","summary":" We study the Freeze-Tag Problem (FTP), introduced by Arkin et al. (SODA'02),\nwhere the objective is to activate a group of n robots, starting from a single\ninitially active robot. 
Robots are positioned in $\\mathbb{R}^d$, and once\nactivated, they move at a constant speed to wake up others. The goal is to\nminimize the time required to activate the last robot, known as the makespan.\nWe establish new upper bounds for the makespan under the $l_1$ and $l_2$ norms\nin $\\mathbb{R}^2$ and $\\mathbb{R}^3$. Specifically, we improve the previous\nupper bound for $(\\mathbb{R}^2, l_2)$ from $7.07r$ (Bonichon et al., DISC'24)\nto $5.064r$. For $(\\mathbb{R}^3, l_1)$, we derive a makespan bound of $13r$,\nwhich translates to $22.52r$ for $(\\mathbb{R}^3, l_2)$. Here, $r$ denotes the\nmaximum distance of any robot from the initially active robot under the given\nnorm. To our knowledge, these are the first makespan bounds for FTP in\n$\\mathbb{R}^3$. Additionally, we show that the maximum makespan for $n$ robots\nis not necessarily achieved when robots are equally distributed along the\nboundary in $(\\mathbb{R}^2, l_2)$. We further investigate FTP in\n$(\\mathbb{R}^3, l_2)$ for specific configurations where robots lie on a\nboundary, providing insights into practical scenarios.\n","authors":["Sharareh Alipour","Kajal Baghestani","Mahdis Mirzaei","Soroush Sahraei"],"pdf_url":"https://arxiv.org/pdf/2412.19706v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06782v2","updated":"2025-01-13T14:11:49Z","published":"2024-11-11T08:19:54Z","title":"QuadWBG: Generalizable Quadrupedal Whole-Body Grasping","summary":" Legged robots with advanced manipulation capabilities have the potential to\nsignificantly improve household duties and urban maintenance. Despite\nconsiderable progress in developing robust locomotion and precise manipulation\nmethods, seamlessly integrating these into cohesive whole-body control for\nreal-world applications remains challenging. In this paper, we present a\nmodular framework for robust and generalizable whole-body loco-manipulation\ncontroller based on a single arm-mounted camera. 
By using reinforcement\nlearning (RL), we enable a robust low-level policy for command execution over 5\ndimensions (5D) and a grasp-aware high-level policy guided by a novel metric,\nGeneralized Oriented Reachability Map (GORM). The proposed system achieves\nstate-of-the-art one-time grasping accuracy of 89% in the real world, including\nchallenging tasks such as grasping transparent objects. Through extensive\nsimulations and real-world experiments, we demonstrate that our system can\neffectively manage a large workspace, from floor level to above body height,\nand perform diverse whole-body loco-manipulation tasks.\n","authors":["Jilong Wang","Javokhirbek Rajabov","Chaoyi Xu","Yiming Zheng","He Wang"],"pdf_url":"https://arxiv.org/pdf/2411.06782v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07343v1","updated":"2025-01-13T13:57:37Z","published":"2025-01-13T13:57:37Z","title":"Fast-Revisit Coverage Path Planning for Autonomous Mobile Patrol Robots\n Using Long-Range Sensor Information","summary":" The utilization of Unmanned Ground Vehicles (UGVs) for patrolling industrial\nsites has expanded significantly. These UGVs typically are equipped with\nperception systems, e.g., computer vision, with limited range due to sensor\nlimitations or site topology. High-level control of the UGVs requires Coverage\nPath Planning (CPP) algorithms that navigate all relevant waypoints and\npromptly start the next cycle. In this paper, we propose the novel Fast-Revisit\nCoverage Path Planning (FaRe-CPP) algorithm using a greedy heuristic approach\nto propose waypoints for maximum coverage area and a random search-based path\noptimization technique to obtain a path along the proposed waypoints with\nminimum revisit time. We evaluated the algorithm in a simulated environment\nusing Gazebo and a camera-equipped TurtleBot3 against a number of existing\nalgorithms. 
Compared to their average revisit times and path lengths, our\nFaRe-CPP algorithm approximately showed a 45% and 40% reduction, respectively,\nin these highly relevant performance indicators.\n","authors":["Srinivas Kachavarapu","Tobias Doernbach","Reinhard Gerndt"],"pdf_url":"https://arxiv.org/pdf/2501.07343v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07317v1","updated":"2025-01-13T13:28:03Z","published":"2025-01-13T13:28:03Z","title":"Evaluation of Artificial Intelligence Methods for Lead Time Prediction\n in Non-Cycled Areas of Automotive Production","summary":" The present study examines the effectiveness of applying Artificial\nIntelligence methods in an automotive production environment to predict unknown\nlead times in a non-cycle-controlled production area. Data structures are\nanalyzed to identify contextual features and then preprocessed using one-hot\nencoding. Methods selection focuses on supervised machine learning techniques.\nIn supervised learning methods, regression and classification methods are\nevaluated. Continuous regression based on target size distribution is not\nfeasible. Classification methods analysis shows that Ensemble Learning and\nSupport Vector Machines are the most suitable. Preliminary study results\nindicate that gradient boosting algorithms LightGBM, XGBoost, and CatBoost\nyield the best results. After further testing and extensive hyperparameter\noptimization, the final method choice is the LightGBM algorithm. Depending on\nfeature availability and prediction interval granularity, relative prediction\naccuracies of up to 90% can be achieved. Further tests highlight the importance\nof periodic retraining of AI models to accurately represent complex production\nprocesses using the database. 
The research demonstrates that AI methods can be\neffectively applied to highly variable production data, adding business value\nby providing an additional metric for various control tasks while outperforming\ncurrent non AI-based systems.\n","authors":["Cornelius Hake","Jonas Weigele","Frederik Reichert","Christian Friedrich"],"pdf_url":"https://arxiv.org/pdf/2501.07317v1.pdf","comment":"7 pages, 4 figures, CLC2024 Conference"},{"id":"http://arxiv.org/abs/2501.07299v1","updated":"2025-01-13T13:07:20Z","published":"2025-01-13T13:07:20Z","title":"ViewVR: Visual Feedback Modes to Achieve Quality of VR-based\n Telemanipulation","summary":" The paper focuses on an immersive teleoperation system that enhances\noperator's ability to actively perceive the robot's surroundings. A\nconsumer-grade HTC Vive VR system was used to synchronize the operator's hand\nand head movements with a UR3 robot and a custom-built robotic head with two\ndegrees of freedom (2-DoF). The system's usability, manipulation efficiency,\nand intuitiveness of control were evaluated in comparison with static head\ncamera positioning across three distinct tasks. Code and other supplementary\nmaterials can be accessed by link: https://github.com/ErkhovArtem/ViewVR\n","authors":["A. Erkhov","A. Bazhenov","S. Satsevich","D. Belov","F. Khabibullin","S. Egorov","M. Gromakov","M. Altamirano Cabrera","D. Tsetserukou"],"pdf_url":"https://arxiv.org/pdf/2501.07299v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07295v1","updated":"2025-01-13T13:01:21Z","published":"2025-01-13T13:01:21Z","title":"GestLLM: Advanced Hand Gesture Interpretation via Large Language Models\n for Human-Robot Interaction","summary":" This paper introduces GestLLM, an advanced system for human-robot interaction\nthat enables intuitive robot control through hand gestures. 
Unlike conventional\nsystems, which rely on a limited set of predefined gestures, GestLLM leverages\nlarge language models and feature extraction via MediaPipe to interpret a\ndiverse range of gestures. This integration addresses key limitations in\nexisting systems, such as restricted gesture flexibility and the inability to\nrecognize complex or unconventional gestures commonly used in human\ncommunication.\n By combining state-of-the-art feature extraction and language model\ncapabilities, GestLLM achieves performance comparable to leading\nvision-language models while supporting gestures underrepresented in\ntraditional datasets. For example, this includes gestures from popular culture,\nsuch as the ``Vulcan salute\" from Star Trek, without any additional\npretraining, prompt engineering, etc. This flexibility enhances the naturalness\nand inclusivity of robot control, making interactions more intuitive and\nuser-friendly.\n GestLLM provides a significant step forward in gesture-based interaction,\nenabling robots to understand and respond to a wide variety of hand gestures\neffectively. This paper outlines its design, implementation, and evaluation,\ndemonstrating its potential applications in advanced human-robot collaboration,\nassistive robotics, and interactive entertainment.\n","authors":["Oleg Kobzarev","Artem Lykov","Dzmitry Tsetserukou"],"pdf_url":"https://arxiv.org/pdf/2501.07295v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07259v1","updated":"2025-01-13T12:14:48Z","published":"2025-01-13T12:14:48Z","title":"PO-GVINS: Tightly Coupled GNSS-Visual-Inertial Integration with\n Pose-Only Representation","summary":" Accurate and reliable positioning is crucial for perception, decision-making,\nand other high-level applications in autonomous driving, unmanned aerial\nvehicles, and intelligent robots. 
Given the inherent limitations of standalone\nsensors, integrating heterogeneous sensors with complementary capabilities is\none of the most effective approaches to achieving this goal. In this paper, we\npropose a filtering-based, tightly coupled global navigation satellite system\n(GNSS)-visual-inertial positioning framework with a pose-only formulation\napplied to the visual-inertial system (VINS), termed PO-GVINS. Specifically,\nthe multiple-view imaging used in current VINS requires a priori knowledge of\n3D features and jointly estimates camera poses and 3D feature positions, which\ninevitably introduces feature linearization errors and faces dimensional\nexplosion. In contrast, the pose-only (PO) formulation, which has been shown to\nbe equivalent to multiple-view imaging and has been applied in visual\nreconstruction, represents feature depth using two camera poses; the 3D feature\npositions are thus removed from the state vector, avoiding the aforementioned\ndifficulties. Inspired by this, we first apply the PO formulation in our VINS,\ni.e., PO-VINS. GNSS raw measurements are then incorporated with integer\nambiguity resolution to achieve accurate and drift-free estimation. Extensive\nexperiments demonstrate that the proposed PO-VINS significantly outperforms the\nmulti-state constrained Kalman filter (MSCKF). 
By incorporating GNSS\nmeasurements, PO-GVINS achieves accurate, drift-free state estimation, making\nit a robust solution for positioning in challenging environments.\n","authors":["Zhuo Xu","Feng Zhu","Zihang Zhang","Chang Jian","Jiarui Lv","Yuantai Zhang","Xiaohong Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.07259v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07255v1","updated":"2025-01-13T12:06:58Z","published":"2025-01-13T12:06:58Z","title":"GazeGrasp: DNN-Driven Robotic Grasping with Wearable Eye-Gaze Interface","summary":" We present GazeGrasp, a gaze-based manipulation system enabling individuals\nwith motor impairments to control collaborative robots using eye-gaze. The\nsystem employs an ESP32 CAM for eye tracking, MediaPipe for gaze detection, and\nYOLOv8 for object localization, integrated with a Universal Robot UR10 for\nmanipulation tasks. After user-specific calibration, the system allows\nintuitive object selection with a magnetic snapping effect and robot control\nvia eye gestures. Experimental evaluation involving 13 participants\ndemonstrated that the magnetic snapping effect significantly reduced gaze\nalignment time, improving task efficiency by 31%. GazeGrasp provides a robust,\nhands-free interface for assistive robotics, enhancing accessibility and\nautonomy for users.\n","authors":["Issatay Tokmurziyev","Miguel Altamirano Cabrera","Luis Moreno","Muhammad Haris Khan","Dzmitry Tsetserukou"],"pdf_url":"https://arxiv.org/pdf/2501.07255v1.pdf","comment":"Accepted to: IEEE/ACM International Conference on Human-Robot\n Interaction (HRI 2025)"},{"id":"http://arxiv.org/abs/2412.20104v2","updated":"2025-01-13T11:46:06Z","published":"2024-12-28T10:12:12Z","title":"SyncDiff: Synchronized Motion Diffusion for Multi-Body Human-Object\n Interaction Synthesis","summary":" Synthesizing realistic human-object interaction motions is a critical problem\nin VR/AR and human animation. 
Unlike the commonly studied scenarios involving a\nsingle human or hand interacting with one object, we address a more generic\nmulti-body setting with arbitrary numbers of humans, hands, and objects. This\ncomplexity introduces significant challenges in synchronizing motions due to\nthe high correlations and mutual influences among bodies. To address these\nchallenges, we introduce SyncDiff, a novel method for multi-body interaction\nsynthesis using a synchronized motion diffusion strategy. SyncDiff employs a\nsingle diffusion model to capture the joint distribution of multi-body motions.\nTo enhance motion fidelity, we propose a frequency-domain motion decomposition\nscheme. Additionally, we introduce a new set of alignment scores to emphasize\nthe synchronization of different body motions. SyncDiff jointly optimizes both\ndata sample likelihood and alignment likelihood through an explicit\nsynchronization strategy. Extensive experiments across four datasets with\nvarious multi-body configurations demonstrate the superiority of SyncDiff over\nexisting state-of-the-art motion synthesis methods.\n","authors":["Wenkun He","Yun Liu","Ruitao Liu","Li Yi"],"pdf_url":"https://arxiv.org/pdf/2412.20104v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07224v1","updated":"2025-01-13T11:22:57Z","published":"2025-01-13T11:22:57Z","title":"Touched by ChatGPT: Using an LLM to Drive Affective Tactile Interaction","summary":" Touch is a fundamental aspect of emotion-rich communication, playing a vital\nrole in human interaction and offering significant potential in human-robot\ninteraction. Previous research has demonstrated that a sparse representation of\nhuman touch can effectively convey social tactile signals. However, advances in\nhuman-robot tactile interaction remain limited, as many humanoid robots possess\nsimplistic capabilities, such as only opening and closing their hands,\nrestricting nuanced tactile expressions. 
In this study, we explore how a robot\ncan use sparse representations of tactile vibrations to convey emotions to a\nperson. To achieve this, we developed a wearable sleeve integrated with a 5x5\ngrid of vibration motors, enabling the robot to communicate diverse tactile\nemotions and gestures. Using chain prompts within a Large Language Model (LLM),\nwe generated distinct 10-second vibration patterns corresponding to 10 emotions\n(e.g., happiness, sadness, fear) and 6 touch gestures (e.g., pat, rub, tap).\nParticipants (N = 32) then rated each vibration stimulus based on perceived\nvalence and arousal. People are accurate at recognising intended emotions, a\nresult which aligns with earlier findings. These results highlight the LLM's\nability to generate emotional haptic data and effectively convey emotions\nthrough tactile signals. By translating complex emotional and tactile\nexpressions into vibratory patterns, this research demonstrates how LLMs can\nenhance physical interaction between humans and robots.\n","authors":["Qiaoqiao Ren","Tony Belpaeme"],"pdf_url":"https://arxiv.org/pdf/2501.07224v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07223v1","updated":"2025-01-13T11:21:53Z","published":"2025-01-13T11:21:53Z","title":"Improving Incremental Nonlinear Dynamic Inversion Robustness Using\n Robust Control in Aerial Robotics","summary":" Improving robustness to uncertainty and rejection of external disturbances\nrepresents a significant challenge in aerial robotics. Nonlinear controllers\nbased on Incremental Nonlinear Dynamic Inversion (INDI), known for their\nability in estimating disturbances through measured-filtered data, have been\nnotably used in such applications. Typically, these controllers comprise two\ncascaded loops: an inner loop employing nonlinear dynamic inversion and an\nouter loop generating the virtual control inputs via linear controllers. 
In\nthis paper, a novel methodology is introduced that combines the advantages of\nINDI with the robustness of linear structured $\\mathcal{H}_\\infty$ controllers.\nA full cascaded architecture is proposed to control the dynamics of a\nmultirotor drone, covering both stabilization and guidance. In particular,\nlow-order $\\mathcal{H}_\\infty$ controllers are designed for the outer loop by\nproperly structuring the problem and solving it through non-smooth\noptimization. A comparative analysis is conducted between an existing INDI/PD\napproach and the proposed INDI/$\\mathcal{H}_\\infty$ strategy, showing a notable\nenhancement in the rejection of external disturbances. It is carried out first\nusing MATLAB simulations involving a nonlinear model of a Parrot Bebop\nquadcopter drone, and then experimentally using a customized quadcopter built\nby the ENAC team. The results show an improvement of more than 50\\% in the\nrejection of disturbances such as gusts.\n","authors":["Mohamad Hachem","Clément Roos","Thierry Miquel","Murat Bronz"],"pdf_url":"https://arxiv.org/pdf/2501.07223v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07216v1","updated":"2025-01-13T11:14:05Z","published":"2025-01-13T11:14:05Z","title":"Temperature Driven Multi-modal/Single-actuated Soft Finger","summary":" Soft pneumatic fingers are of great research interest. However, their\nsignificant potential is limited as most of them can generate only one motion,\nmostly bending. The conventional design of soft fingers does not allow them to\nswitch to another motion mode. In this paper, we developed a novel multi-modal\nand single-actuated soft finger whose motion mode is switched by changing\nthe finger's temperature. Our soft finger is capable of switching between three\ndistinct motion modes: bending, twisting, and extension, in approximately\nfive seconds. We carried out a detailed experimental study of the soft finger\nand evaluated its repeatability and range of motion. 
It exhibited repeatability\nof around one millimeter and a fifty percent larger range of motion than a\nstandard bending actuator. We developed an analytical model for a\nfiber-reinforced soft actuator for twisting motion. This helped us relate the\ninput pressure to the output twist radius of the twisting motion. This model\nwas validated experimentally. Further, a soft robotic gripper\nwith multiple grasp modes was developed using three actuators. This gripper can\nadapt to and grasp objects across a large range of sizes, shapes, and\nstiffnesses. We showcased its grasping capabilities by successfully grasping a\nsmall berry, a large roll, and a delicate tofu cube.\n","authors":["Prashant Kumar","Weiwei Wan","Kensuke Harada"],"pdf_url":"https://arxiv.org/pdf/2501.07216v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07213v1","updated":"2025-01-13T11:12:47Z","published":"2025-01-13T11:12:47Z","title":"Multi-face emotion detection for effective Human-Robot Interaction","summary":" The integration of dialogue interfaces in mobile devices has become\nubiquitous, providing a wide array of services. As technology progresses,\nhumanoid robots designed with human-like features to interact effectively with\npeople are gaining prominence, and the use of advanced human-robot dialogue\ninterfaces is continually expanding. In this context, emotion recognition plays\na crucial role in enhancing human-robot interaction by enabling robots to\nunderstand human intentions. This research proposes a facial emotion detection\ninterface integrated into a mobile humanoid robot, capable of displaying\nreal-time emotions from multiple individuals on a user interface. To this end,\nvarious deep neural network models for facial expression recognition were\ndeveloped and evaluated under consistent computer-based conditions, yielding\npromising results. 
Afterwards, a trade-off between accuracy and memory\nfootprint was carefully considered to effectively implement this application on\na mobile humanoid robot.\n","authors":["Mohamed Ala Yahyaoui","Mouaad Oujabour","Leila Ben Letaifa","Amine Bohi"],"pdf_url":"https://arxiv.org/pdf/2501.07213v1.pdf","comment":"9 pages, 8 figures and 1 table. Accepted at the 17th International\n Conference on Agents and Artificial Intelligence (ICAART 2025), Porto,\n Portugal"},{"id":"http://arxiv.org/abs/2501.07180v1","updated":"2025-01-13T10:19:30Z","published":"2025-01-13T10:19:30Z","title":"Evaluating Robotic Approach Techniques for the Insertion of a Straight\n Instrument into a Vitreoretinal Surgery Trocar","summary":" Advances in vitreoretinal robotic surgery enable precise techniques for gene\ntherapies. This study evaluates three robotic approaches using the 7-DoF\nrobotic arm for docking a micro-precise tool to a trocar: fully co-manipulated,\nhybrid co-manipulated/teleoperated, and hybrid with camera assistance. The\nfully co-manipulated approach was the fastest but had a 42% success rate.\nHybrid methods showed higher success rates (91.6% and 100%) and completed tasks\nwithin 2 minutes. 
NASA Task Load Index (TLX) assessments indicated lower\nphysical demand and effort for hybrid approaches.\n","authors":["Ross Henry","Martin Huber","Anestis Mablekos-Alexiou","Carlo Seneci","Mohamed Abdelaziz","Hans Natalius","Lyndon da Cruz","Christos Bergeles"],"pdf_url":"https://arxiv.org/pdf/2501.07180v1.pdf","comment":"2 Pages, 2 Figures, 1 Table"},{"id":"http://arxiv.org/abs/2409.06501v3","updated":"2025-01-13T09:53:48Z","published":"2024-09-10T13:34:53Z","title":"An Adaptive Sliding Window Estimator for Positioning of Unmanned Aerial\n Vehicle Using a Single Anchor","summary":" Localization using a single range anchor combined with onboard\noptical-inertial odometry offers a lightweight solution that provides\nmultidimensional measurements for the positioning of unmanned aerial vehicles.\nUnfortunately, the performance of such lightweight sensors varies with the\ndynamic environment, and the fidelity of the dynamic model is also severely\naffected by environmental aerial flow. To address this challenge, we propose an\nadaptive sliding window estimator equipped with an estimation reliability\nevaluator, where the states, noise covariance matrices and aerial drag are\nestimated simultaneously. The aerial drag effects are first evaluated based on\nposterior states and covariance. Then, an augmented Kalman filter is designed\nto pre-process multidimensional measurements and inherit historical\ninformation. Subsequently, an inverse-Wishart smoother is employed to estimate\nposterior states and covariance matrices. To further suppress potential\ndivergence, a reliability evaluator is devised to infer estimation errors. We\nfurther determine the fidelity of each sensor based on the error propagation.\nExtensive experiments are conducted in both standard and harsh environments,\ndemonstrating the adaptability and robustness of the proposed method. 
The root\nmean square error reaches 0.15 m, outperforming the state-of-the-art approach.\n","authors":["Kaiwen Xiong","Sijia Chen","Wei Dong"],"pdf_url":"https://arxiv.org/pdf/2409.06501v3.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2407.11218v3","updated":"2025-01-13T09:23:41Z","published":"2024-07-15T20:07:33Z","title":"Walk along: An Experiment on Controlling the Mobile Robot 'Spot' with\n Voice and Gestures","summary":" Robots are becoming more capable and can autonomously perform tasks such as\nnavigating between locations. However, human oversight remains crucial. This\nstudy compared two touchless methods for directing mobile robots: voice control\nand gesture control, to investigate the efficiency of the methods and the\npreference of users. We tested these methods in two conditions: one in which\nparticipants remained stationary and one in which they walked freely alongside\nthe robot. We hypothesized that walking alongside the robot would result in\nhigher intuitiveness ratings and improved task performance, based on the idea\nthat walking promotes spatial alignment and reduces the effort required for\nmental rotation. In a 2x2 within-subject design, 218 participants guided the\nquadruped robot Spot along a circuitous route with multiple 90-degree turns\nusing rotate left, rotate right, and walk forward commands. After each trial,\nparticipants rated the intuitiveness of the command mapping, while\npost-experiment interviews were used to gather the participants' preferences.\nResults showed that voice control combined with walking with Spot was the most\nfavored and intuitive, whereas gesture control while standing caused confusion\nfor left/right commands. Nevertheless, 29% of participants preferred gesture\ncontrol, citing increased task engagement and visual congruence as reasons. 
An\nodometry-based analysis revealed that participants often followed behind Spot,\nparticularly in the gesture control condition, when they were allowed to walk.\nIn conclusion, voice control with walking produced the best outcomes. Improving\nphysical ergonomics and adjusting gesture types could make gesture control more\neffective.\n","authors":["Renchi Zhang","Jesse van der Linden","Dimitra Dodou","Harleigh Seyffert","Yke Bauke Eisma","Joost C. F. de Winter"],"pdf_url":"https://arxiv.org/pdf/2407.11218v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01144v3","updated":"2025-01-13T08:40:27Z","published":"2024-09-02T10:28:18Z","title":"Adaptive Non-linear Centroidal MPC with Stability Guarantees for Robust\n Locomotion of Legged Robots","summary":" Nonlinear model predictive locomotion controllers based on the reduced\ncentroidal dynamics are nowadays ubiquitous in legged robots. These schemes,\neven if they assume an inherent simplification of the robot's dynamics, were\nshown to endow robots with a step-adjustment capability in reaction to small\npushes, and, moreover, in the case of uncertain parameters - as unknown\npayloads - they were shown to be able to provide some practical, albeit\nlimited, robustness. In this work, we provide rigorous certificates of their\nclosed loop stability via a reformulation of the centroidal MPC controller.\nThis is achieved thanks to a systematic procedure inspired by the machinery of\nadaptive control, together with ideas coming from Control Lyapunov functions.\nOur reformulation, in addition, provides robustness for a class of unmeasured\nconstant disturbances. 
To demonstrate the generality of our approach, we\nvalidated our formulation on a new generation of humanoid robots - the 56.7 kg\nergoCub, as well as on a commercially available 21 kg quadruped robot, Aliengo.\n","authors":["Mohamed Elobaid","Giulio Turrisi","Lorenzo Rapetti","Giulio Romualdi","Stefano Dafarra","Tomohiro Kawakami","Tomohiro Chaki","Takahide Yoshiike","Claudio Semini","Daniele Pucci"],"pdf_url":"https://arxiv.org/pdf/2409.01144v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14797v2","updated":"2025-01-13T07:47:32Z","published":"2024-07-20T07:56:24Z","title":"From Underground Mines to Offices: A Versatile and Robust Framework for\n Range-Inertial SLAM","summary":" Simultaneous Localization and Mapping (SLAM) is an essential component of\nautonomous robotic applications and self-driving vehicles, enabling them to\nunderstand and operate in their environment. Many SLAM systems have been\nproposed in the last decade, but they are often complex to adapt to different\nsettings or sensor setups. In this work, we present LiDAR Graph-SLAM (LG-SLAM),\na versatile range-inertial SLAM framework that can be adapted to different\ntypes of sensors and environments, from underground mines to offices with\nminimal parameter tuning. Our system integrates range, inertial and GNSS\nmeasurements into a graph-based optimization framework. We also use a refined\nsubmap management approach and a robust loop closure method that effectively\naccounts for uncertainty in the identification and validation of putative loop\nclosures, ensuring global consistency and robustness. Enabled by a parallelized\narchitecture and GPU integration, our system achieves pose estimation at LiDAR\nframe rate, along with online loop closing and graph optimization. 
We validate\nour system in diverse environments using public datasets and real-world data,\nconsistently achieving an average error below 20 cm and outperforming other\nstate-of-the-art algorithms.\n","authors":["Lorenzo Montano-Oliván","Julio A. Placed","Luis Montano","María T. Lázaro"],"pdf_url":"https://arxiv.org/pdf/2407.14797v2.pdf","comment":"8 pages, 8 figures, 3 tables"},{"id":"http://arxiv.org/abs/2407.10031v2","updated":"2025-01-13T06:03:14Z","published":"2024-07-14T00:12:44Z","title":"LLaMAR: Long-Horizon Planning for Multi-Agent Robots in Partially\n Observable Environments","summary":" The ability of Language Models (LMs) to understand natural language makes\nthem a powerful tool for parsing human instructions into task plans for\nautonomous robots. Unlike traditional planning methods that rely on\ndomain-specific knowledge and handcrafted rules, LMs generalize from diverse\ndata and adapt to various tasks with minimal tuning, acting as a compressed\nknowledge base. However, LMs in their standard form face challenges with\nlong-horizon tasks, particularly in partially observable multi-agent settings.\nWe propose an LM-based Long-Horizon Planner for Multi-Agent Robotics (LLaMAR),\na cognitive architecture for planning that achieves state-of-the-art results in\nlong-horizon tasks within partially observable environments. LLaMAR employs a\nplan-act-correct-verify framework, allowing self-correction from action\nexecution feedback without relying on oracles or simulators. Additionally, we\npresent MAP-THOR, a comprehensive test suite encompassing household tasks of\nvarying complexity within the AI2-THOR environment. Experiments show that\nLLaMAR achieves a 30% higher success rate than other state-of-the-art LM-based\nmulti-agent planners in MAP-THOR and Search \\& Rescue tasks. 
Code can be found\nat https://github.com/nsidn98/LLaMAR\n","authors":["Siddharth Nayak","Adelmo Morrison Orozco","Marina Ten Have","Vittal Thirumalai","Jackson Zhang","Darren Chen","Aditya Kapoor","Eric Robinson","Karthik Gopalakrishnan","James Harrison","Brian Ichter","Anuj Mahajan","Hamsa Balakrishnan"],"pdf_url":"https://arxiv.org/pdf/2407.10031v2.pdf","comment":"27 pages, 4 figures, 5 tables"},{"id":"http://arxiv.org/abs/2501.07051v1","updated":"2025-01-13T04:18:52Z","published":"2025-01-13T04:18:52Z","title":"ROSAnnotator: A Web Application for ROSBag Data Analysis in Human-Robot\n Interaction","summary":" Human-robot interaction (HRI) is an interdisciplinary field that utilises\nboth quantitative and qualitative methods. While ROSBags, a file format within\nthe Robot Operating System (ROS), offer an efficient means of collecting\ntemporally synched multimodal data in empirical studies with real robots, there\nis a lack of tools specifically designed to integrate qualitative coding and\nanalysis functions with ROSBags. To address this gap, we developed\nROSAnnotator, a web-based application that incorporates a multimodal Large\nLanguage Model (LLM) to support both manual and automated annotation of ROSBag\ndata. ROSAnnotator currently facilitates video, audio, and transcription\nannotations and provides an open interface for custom ROS messages and tools.\nBy using ROSAnnotator, researchers can streamline the qualitative analysis\nprocess, create a more cohesive analysis pipeline, and quickly access\nstatistical summaries of annotations, thereby enhancing the overall efficiency\nof HRI data analysis. 
https://github.com/CHRI-Lab/ROSAnnotator\n","authors":["Yan Zhang","Haoqi Li","Ramtin Tabatabaei","Wafa Johal"],"pdf_url":"https://arxiv.org/pdf/2501.07051v1.pdf","comment":"Accepted to HRI 2025"},{"id":"http://arxiv.org/abs/2412.16908v2","updated":"2025-01-13T04:11:53Z","published":"2024-12-22T07:54:21Z","title":"Map Imagination Like Blind Humans: Group Diffusion Model for Robotic Map\n Generation","summary":" Can robots imagine or generate maps like humans do, especially when only\nlimited information can be perceived, as for blind people? To address this\nchallenging task, we propose a novel group diffusion model (GDM) based\narchitecture for robots to generate point cloud maps with very limited input\ninformation. Inspired by blind humans' natural capability of imagining or\ngenerating mental maps, the proposed method can generate maps without visual\nperception data or depth data. With additional limited super-sparse spatial\npositioning data, like the extra contact-based positioning information that\nblind individuals can obtain, the map generation quality can be improved even\nfurther. Experiments on public datasets are conducted, and the results indicate\nthat our method can generate reasonable maps solely based on path data, and\nproduce even more refined maps upon incorporating exiguous LiDAR data. Compared\nto conventional mapping approaches, our novel method significantly mitigates\nsensor dependency, enabling robots to imagine and generate elementary maps\nwithout heavy onboard sensory devices.\n","authors":["Qijin Song","Weibang Bai"],"pdf_url":"https://arxiv.org/pdf/2412.16908v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05723v2","updated":"2025-01-13T02:58:58Z","published":"2025-01-10T05:43:34Z","title":"Robot Error Awareness Through Human Reactions: Implementation,\n Evaluation, and Recommendations","summary":" Effective error detection is crucial to prevent task disruption and maintain\nuser trust. 
Traditional methods often rely on task-specific models or user\nreporting, which can be inflexible or slow. Recent research suggests that social\nsignals, naturally exhibited by users in response to robot errors, can enable\nmore flexible, timely error detection. However, most studies rely on post hoc\nanalysis, leaving their real-time effectiveness uncertain and lacking\nuser-centric evaluation. In this work, we developed a proactive error detection\nsystem that combines user behavioral signals (facial action units and speech),\nuser feedback, and error context for automatic error detection. In a study (N =\n28), we compared our proactive system to a status quo reactive approach.\nResults show our system 1) reliably and flexibly detects errors, 2) detects\nerrors faster than the reactive approach, and 3) is perceived more favorably by\nusers than the reactive one. We discuss recommendations for enabling robot\nerror awareness in future HRI systems.\n","authors":["Maia Stiber","Russell Taylor","Chien-Ming Huang"],"pdf_url":"https://arxiv.org/pdf/2501.05723v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07013v1","updated":"2025-01-13T02:15:15Z","published":"2025-01-13T02:15:15Z","title":"Sthymuli: a Static Educational Robot. Leveraging the Thymio II Platform","summary":" The use of robots in education represents a challenge for teachers and a\nfixed vision of what robots can do for students. This paper presents the\ndevelopment of Sthymuli, a static educational robot designed to explore new\nclassroom interactions between robots, students and teachers. We propose the\nuse of the Thymio II educational platform as a base, ensuring a robust\nbenchmark for a fair comparison of the commonly available wheeled robots and\nour exploratory approach with Sthymuli. 
This paper outlines the constraints and\nrequirements for developing such a robot, the current state of development and\nfuture work.\n","authors":["Manuel Bernal-Lecina","Alejandrina Hernández","Adrien Pannatier","Léa Pereyre","Francesco Mondada"],"pdf_url":"https://arxiv.org/pdf/2501.07013v1.pdf","comment":"Two pages, three figures. ICRA40 extended abstract"},{"id":"http://arxiv.org/abs/2501.06994v1","updated":"2025-01-13T01:01:44Z","published":"2025-01-13T01:01:44Z","title":"Motion Tracks: A Unified Representation for Human-Robot Transfer in\n Few-Shot Imitation Learning","summary":" Teaching robots to autonomously complete everyday tasks remains a challenge.\nImitation Learning (IL) is a powerful approach that imbues robots with skills\nvia demonstrations, but is limited by the labor-intensive process of collecting\nteleoperated robot data. Human videos offer a scalable alternative, but it\nremains difficult to directly train IL policies from them due to the lack of\nrobot action labels. To address this, we propose to represent actions as\nshort-horizon 2D trajectories on an image. These actions, or motion tracks,\ncapture the predicted direction of motion for either human hands or robot\nend-effectors. We instantiate an IL policy called Motion Track Policy (MT-pi)\nwhich receives image observations and outputs motion tracks as actions. By\nleveraging this unified, cross-embodiment action space, MT-pi completes tasks\nwith high success given just minutes of human video and limited additional\nrobot demonstrations. At test time, we predict motion tracks from two camera\nviews, recovering 6DoF trajectories via multi-view synthesis. MT-pi achieves an\naverage success rate of 86.5% across 4 real-world tasks, outperforming\nstate-of-the-art IL baselines which do not leverage human data or our action\nspace by 40%, and generalizes to scenarios seen only in human videos. 
Code and\nvideos are available on our website\nhttps://portal-cornell.github.io/motion_track_policy/.\n","authors":["Juntao Ren","Priya Sundaresan","Dorsa Sadigh","Sanjiban Choudhury","Jeannette Bohg"],"pdf_url":"https://arxiv.org/pdf/2501.06994v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06987v1","updated":"2025-01-13T00:29:57Z","published":"2025-01-13T00:29:57Z","title":"Hand-Object Contact Detection using Grasp Quality Metrics","summary":" We propose a novel hand-object contact detection system based on grasp\nquality metrics extracted from object and hand poses, and evaluate its\nperformance using the DexYCB dataset. Our evaluation demonstrated the system's\nhigh accuracy (approaching 90%). Future work will focus on a real-time\nimplementation using vision-based estimation, and integrating it into a\nrobot-to-human handover system.\n","authors":["Akansel Cosgun","Thanh Vinh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2501.06987v1.pdf","comment":"Submitted to the 2025 IEEE/ACM International Conference on\n Human-Robot Interaction (HRI'25)"},{"id":"http://arxiv.org/abs/2411.10941v2","updated":"2025-01-13T00:03:58Z","published":"2024-11-17T02:39:58Z","title":"Efficient Estimation of Relaxed Model Parameters for Robust UAV\n Trajectory Optimization","summary":" Online trajectory optimization and optimal control methods are crucial for\nenabling sustainable unmanned aerial vehicle (UAV) services, such as\nagriculture, environmental monitoring, and transportation, where available\nactuation and energy are limited. However, optimal controllers are highly\nsensitive to model mismatch, which can occur due to loaded equipment, packages\nto be delivered, or pre-existing variability in fundamental structural and\nthrust-related parameters. To circumvent this problem, optimal controllers can\nbe paired with parameter estimators to improve their trajectory planning\nperformance and perform adaptive control. 
However, UAV platforms are limited in\nterms of onboard processing power, oftentimes making nonlinear parameter\nestimation too computationally expensive to consider. To address these issues,\nwe propose a relaxed, affine-in-parameters multirotor model along with an\nefficient optimal parameter estimator. We convexify the nominal Moving Horizon\nParameter Estimation (MHPE) problem into a linear-quadratic form (LQ-MHPE) via\nan affine-in-parameter relaxation on the nonlinear dynamics, resulting in fast\nquadratic programs (QPs) that facilitate adaptive Model Predictive Control (MPC)\nin real time. We compare this approach to the equivalent nonlinear estimator in\nMonte Carlo simulations, demonstrating a decrease in average solve time and\ntrajectory optimality cost by 98.2% and 23.9-56.2%, respectively.\n","authors":["Derek Fan","David A. Copp"],"pdf_url":"https://arxiv.org/pdf/2411.10941v2.pdf","comment":"8 pages, 5 figures, to be published in IEEE Sustech 2025"},{"id":"http://arxiv.org/abs/2501.07713v1","updated":"2025-01-13T21:52:46Z","published":"2025-01-13T21:52:46Z","title":"Testing Human-Hand Segmentation on In-Distribution and\n Out-of-Distribution Data in Human-Robot Interactions Using a Deep Ensemble\n Model","summary":" Reliable detection and segmentation of human hands are critical for enhancing\nsafety and facilitating advanced interactions in human-robot collaboration.\nCurrent research predominantly evaluates hand segmentation under\nin-distribution (ID) data, which reflects the training data of deep learning\n(DL) models. However, this approach fails to address out-of-distribution (OOD)\nscenarios that often arise in real-world human-robot interactions. In this\nstudy, we present a novel approach by evaluating the performance of pre-trained\nDL models under both ID data and more challenging OOD scenarios. 
To mimic\nrealistic industrial scenarios, we designed a diverse dataset featuring simple\nand cluttered backgrounds with industrial tools, varying numbers of hands (0 to\n4), and hands with and without gloves. For OOD scenarios, we incorporated\nunique and rare conditions such as finger-crossing gestures and motion blur\nfrom fast-moving hands, addressing both epistemic and aleatoric uncertainties.\nTo ensure multiple points of view (PoVs), we utilized both egocentric cameras,\nmounted on the operator's head, and static cameras to capture RGB images of\nhuman-robot interactions. This approach allowed us to account for multiple\ncamera perspectives while also evaluating the performance of models trained on\nexisting egocentric datasets as well as static-camera datasets. For\nsegmentation, we used a deep ensemble model composed of UNet and RefineNet as\nbase learners. Performance evaluation was conducted using segmentation metrics\nand uncertainty quantification via predictive entropy. Results revealed that\nmodels trained on industrial datasets outperformed those trained on\nnon-industrial datasets, highlighting the importance of context-specific\ntraining. Although all models struggled with OOD scenarios, those trained on\nindustrial datasets demonstrated significantly better generalization.\n","authors":["Reza Jalayer","Yuxin Chen","Masoud Jalayer","Carlotta Orsenigo","Masayoshi Tomizuka"],"pdf_url":"https://arxiv.org/pdf/2501.07713v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07705v1","updated":"2025-01-13T21:32:42Z","published":"2025-01-13T21:32:42Z","title":"Autonomous Electrochemistry Platform with Real-Time Normality Testing of\n Voltammetry Measurements Using ML","summary":" Electrochemistry workflows utilize various instruments and computing systems\nto execute workflows consisting of electrocatalyst synthesis, testing and\nevaluation tasks. 
The heterogeneity of the software and hardware of these\necosystems makes it challenging to orchestrate a complete workflow from\nproduction to characterization by automating its tasks. We propose an\nautonomous electrochemistry computing platform for a multi-site ecosystem that\nprovides the services for remote experiment steering, real-time measurement\ntransfer, and AI/ML-driven analytics. We describe the integration of a mobile\nrobot and synthesis workstation into the ecosystem by developing custom\nhub-networks and software modules to support remote operations over the\necosystem's wireless and wired networks. We describe a workflow task for\ngenerating I-V voltammetry measurements using a potentiostat, and a machine\nlearning framework to ensure their normality by detecting abnormal conditions\nsuch as disconnected electrodes. We study a number of machine learning methods\nfor the underlying detection problem, including smooth, non-smooth, structural\nand statistical methods, and their fusers. We present experimental results to\nillustrate the effectiveness of this platform, and also validate the proposed\nML method by deriving its rigorous generalization equations.\n","authors":["Anees Al-Najjar","Nageswara S. V. Rao","Craig A. 
Bridges","Sheng Dai","Alex Walters"],"pdf_url":"https://arxiv.org/pdf/2501.07705v1.pdf","comment":"10 pages, 14 figures, accepted in the IEEE 20th International\n Conference on e-Science (e-Science), 2024"},{"id":"http://arxiv.org/abs/2403.04917v3","updated":"2025-01-13T20:28:04Z","published":"2024-03-07T22:03:36Z","title":"A Mixed-Integer Conic Program for the Moving-Target Traveling Salesman\n Problem based on a Graph of Convex Sets","summary":" This paper introduces a new formulation that finds the optimum for the\nMoving-Target Traveling Salesman Problem (MT-TSP), which seeks to find a\nshortest path for an agent, that starts at a depot, visits a set of moving\ntargets exactly once within their assigned time-windows, and returns to the\ndepot. The formulation relies on the key idea that when the targets move along\nlines, their trajectories become convex sets within the space-time coordinate\nsystem. The problem then reduces to finding the shortest path within a graph of\nconvex sets, subject to some speed constraints. We compare our formulation with\nthe current state-of-the-art Mixed Integer Conic Program (MICP) solver for the\nMT-TSP. The experimental results show that our formulation outperforms the MICP\nfor instances with up to 20 targets, with up to two orders of magnitude\nreduction in runtime, and up to a 60\\% tighter optimality gap. 
We also show\nthat the solution cost from the convex relaxation of our formulation provides\nsignificantly tighter lower bounds for the MT-TSP than the ones from the MICP.\n","authors":["Allen George Philip","Zhongqiang Ren","Sivakumar Rathinam","Howie Choset"],"pdf_url":"https://arxiv.org/pdf/2403.04917v3.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2406.02365v5","updated":"2025-01-13T20:06:35Z","published":"2024-06-04T14:43:50Z","title":"Exploiting Chordal Sparsity for Fast Global Optimality with Application\n to Localization","summary":" In recent years, many estimation problems in robotics have been shown to be\nsolvable to global optimality using their semidefinite relaxations. However,\nthe runtime complexity of off-the-shelf semidefinite programming (SDP) solvers\nis up to cubic in problem size, which inhibits real-time solutions of problems\ninvolving large state dimensions. We show that for a large class of problems,\nnamely those with chordal sparsity, we can reduce the complexity of these\nsolvers to linear in problem size. In particular, we show how to replace the\nlarge positive-semidefinite variable with a number of smaller interconnected\nones using the well-known chordal decomposition. This formulation also allows\nfor the straightforward application of the alternating direction method of\nmultipliers (ADMM), which can exploit parallelism for increased scalability. We\nshow for two example problems in simulation that the chordal solvers provide a\nsignificant speed-up over standard SDP solvers, and that global optimality is\ncrucial in the absence of good initializations.\n","authors":["Frederike Dümbgen","Connor Holmes","Timothy D. Barfoot"],"pdf_url":"https://arxiv.org/pdf/2406.02365v5.pdf","comment":"21 pages, 6 figures. 
Version history: v1: initial arXiv, v2: WAFR\n submission, v3: correction, v4: WAFR conference-ready, v5: WAFR SPAR journal\n version"}],"Systems and Control":[{"id":"http://arxiv.org/abs/2501.07570v1","updated":"2025-01-13T18:57:15Z","published":"2025-01-13T18:57:15Z","title":"Digital Twin for Smart Societies: A Catalyst for Inclusive and\n Accessible Healthcare","summary":" With rapid digitization and digitalization, drawing a fine line between the\ndigital and the physical world has become nearly impossible. It has become\nessential more than ever to integrate all spheres of life into a single Digital\nThread to address pressing challenges of modern society: accessible and\ninclusive healthcare in terms of equality and equity. Techno-social\nadvancements and mutual acceptance have enabled the infusion of digital models\nto simulate social settings with minimum resource utilization to make effective\ndecisions. However, a significant gap exists in feeding back the models with\nappropriate real-time changes. In other words, active behavioral modeling of\nmodern society is lacking, influencing community healthcare as a whole. By\ncreating virtual replicas of (physical) behavioral systems, digital twins can\nenable real-time monitoring, simulation, and optimization of urban dynamics.\nThis paper explores the potential of digital twins to promote inclusive\nhealthcare for evolving smart cities. We argue that digital twins can be used\nto: Identify and address disparities in access to healthcare services,\nFacilitate community participation, Simulate the impact of urban policies and\ninterventions on different groups of people, and Aid policy-making bodies for\nbetter access to healthcare. This paper proposes several ways to use digital\ntwins to stitch the actual and virtual societies. Several discussed concepts\nwithin this framework envision an active, integrated, and synchronized\ncommunity aware of data privacy and security. 
The proposal also provides\nhigh-level step-wise transitions that will enable this transformation.\n","authors":["Joshit Mohanty","Sujatha Alla"," Vaishali","Nagesh Bheesetty","Prasanthi Chidipudi","Satya Prakash Chowdary Nandigam","Marisha Jmukhadze","Puneeth Bheesetty","Narendra Lakshmana Gowda"],"pdf_url":"https://arxiv.org/pdf/2501.07570v1.pdf","comment":"13 pages, 1 figure. This is accepted to publish at the proceedings of\n the 6th International Conference on Artificial Intelligence and Applied\n Mathematics in Engineering (ICAIAME 2024)"},{"id":"http://arxiv.org/abs/2312.15141v2","updated":"2025-01-13T18:21:03Z","published":"2023-12-23T02:34:50Z","title":"Improving the Performance of Echo State Networks Through State Feedback","summary":" Reservoir computing, using nonlinear dynamical systems, offers a\ncost-effective alternative to neural networks for complex tasks involving\nprocessing of sequential data, time series modeling, and system identification.\nEcho state networks (ESNs), a type of reservoir computer, mirror neural\nnetworks but simplify training. They apply fixed, random linear transformations\nto the internal state, followed by nonlinear changes. This process, guided by\ninput signals and linear regression, adapts the system to match target\ncharacteristics, reducing computational demands. A potential drawback of ESNs\nis that the fixed reservoir may not offer the complexity needed for specific\nproblems. While directly altering (training) the internal ESN would reintroduce\nthe computational burden, an indirect modification can be achieved by\nredirecting some output as input. This feedback can influence the internal\nreservoir state, yielding ESNs with enhanced complexity suitable for broader\nchallenges. In this paper, we demonstrate that by feeding some component of the\nreservoir state back into the network through the input, we can drastically\nimprove upon the performance of a given ESN. 
We rigorously prove that, for any\ngiven ESN, feedback will almost always improve the accuracy of the output. For\na set of three tasks, each representing different problem classes, we find that\nwith feedback the average error measures are reduced by $30\\%-60\\%$.\nRemarkably, feedback provides at least an equivalent performance boost to\ndoubling the initial number of computational nodes, a computationally expensive\nand technologically challenging alternative. These results demonstrate the\nbroad applicability and substantial usefulness of this feedback scheme.\n","authors":["Peter J. Ehlers","Hendra I. Nurdin","Daniel Soh"],"pdf_url":"https://arxiv.org/pdf/2312.15141v2.pdf","comment":"36 pages, 6 figures"},{"id":"http://arxiv.org/abs/2501.07516v1","updated":"2025-01-13T17:38:03Z","published":"2025-01-13T17:38:03Z","title":"Determining Disturbance Recovery Conditions by Inverse Sensitivity\n Minimization","summary":" Power systems naturally experience disturbances, some of which can damage\nequipment and disrupt consumers. It is important to quickly assess the likely\nconsequences of credible disturbances and take preventive action, if necessary.\nHowever, assessing the impact of potential disturbances is challenging because\nmany of the influential factors, such as loading patterns, controller settings\nand load dynamics, are not precisely known. To address this issue, the paper\nintroduces the concept of parameter-space recovery regions. For each\ndisturbance, the corresponding recovery region is the region of parameter space\nfor which the system will recover to the desired operating point. The boundary\nof the recovery region establishes the separation between parameter values that\nresult in trouble-free recovery and those that incur undesirable non-recovery.\nThe safety margin for a given set of parameter values is defined as the\nsmallest distance (in parameter space) between the given values and the\nrecovery boundary. 
Novel numerical algorithms with theoretical guarantees are\npresented for efficiently computing recovery boundaries and safety margins.\nUnlike prior methods, which tend to be overly conservative and restricted to\nlow dimensional parameter space, these methods compute safety margins to\narbitrary user-specified accuracy and do so efficiently in high dimensional\nparameter space. The efficacy of the methods is demonstrated using the IEEE\n39-bus benchmark power system, where safety margins are computed for cases that\nconsider up to 86 parameters, and reveal unexpected safety implications that\nwould not have been observed otherwise.\n","authors":["Michael W. Fisher","Ian A. Hiskens"],"pdf_url":"https://arxiv.org/pdf/2501.07516v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2501.07498v1","updated":"2025-01-13T17:16:34Z","published":"2025-01-13T17:16:34Z","title":"Computing Safety Margins of Parameterized Nonlinear Systems for\n Vulnerability Assessment via Trajectory Sensitivities","summary":" Physical systems experience nonlinear disturbances which have the potential\nto disrupt desired behavior. For a particular disturbance, whether or not the\nsystem recovers from the disturbance to a desired stable equilibrium point\ndepends on system parameter values, which are typically uncertain and\ntime-varying. Therefore, to quantify proximity to vulnerability we define the\nsafety margin to be the smallest change in parameter values from a nominal\nvalue such that the system will no longer be able to recover from the\ndisturbance. Safety margins are valuable but challenging to compute as related\nmethods, such as those for robust region of attraction estimation, are often\neither overly conservative or computationally intractable for high dimensional\nsystems. 
Recently, we developed algorithms to compute safety margins\nefficiently and non-conservatively by exploiting the large sensitivity of the\nsystem trajectory near the region of attraction boundary to small\nperturbations. Although these algorithms have enjoyed empirical success, they\nlack theoretical guarantees that would ensure their generalizability. This work\ndevelops a novel characterization of safety margins in terms of trajectory\nsensitivities, and uses this to derive well-posedness and convergence\nguarantees for these algorithms, enabling their generalizability and successful\napplication to a large class of nonlinear systems.\n","authors":["Michael W. Fisher"],"pdf_url":"https://arxiv.org/pdf/2501.07498v1.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2501.07476v1","updated":"2025-01-13T16:48:22Z","published":"2025-01-13T16:48:22Z","title":"Encrypted Computation of Collision Probability for Secure Satellite\n Conjunction Analysis","summary":" The computation of collision probability ($\\mathcal{P}_c$) is crucial for\nspace environmentalism and sustainability by providing decision-making\nknowledge that can prevent collisions between anthropogenic space objects.\nHowever, the accuracy and precision of $\\mathcal{P}_c$ computations is often\ncompromised by limitations in computational resources and data availability.\nWhile significant improvements have been made in the computational aspects, the\nrising concerns regarding the privacy of collaborative data sharing can be a\nmajor limiting factor in the future conjunction analysis and risk assessment,\nespecially as the space environment grows increasingly privatized, competitive,\nand fraught with conflicting strategic interests. 
This paper argues that the\nimportance of privacy measures in space situational awareness (SSA) is\nunderappreciated, and regulatory and compliance measures currently in place are\nnot sufficient by themselves, presenting a significant gap.\n To address this gap, we introduce a novel encrypted architecture that\nleverages advanced cryptographic techniques, including homomorphic encryption\n(HE) and multi-party computation (MPC), to safeguard the privacy of entities\ncomputing space sustainability metrics, inter alia, $\\mathcal{P}_c$. Our\nproposed protocol, Encrypted $\\mathcal{P}_c$, integrates the Monte Carlo\nestimation algorithm with cryptographic solutions, enabling secure collision\nprobability computation without exposing sensitive or proprietary information.\nThis research advances secure conjunction analysis by developing a secure MPC\nprotocol for $\\mathcal{P}_c$ computation and highlights the need for innovative\nprotocols to ensure a more secure and cooperative SSA landscape.\n","authors":["Jihoon Suh","Michael Hibbard","Kaoru Teranishi","Takashi Tanaka","Moriba Jah","Maruthi Akella"],"pdf_url":"https://arxiv.org/pdf/2501.07476v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07461v1","updated":"2025-01-13T16:30:56Z","published":"2025-01-13T16:30:56Z","title":"A Linear Parameter-Varying Framework for the Analysis of Time-Varying\n Optimization Algorithms","summary":" In this paper we propose a framework to analyze iterative first-order\noptimization algorithms for time-varying convex optimization. We assume that\nthe temporal variability is caused by a time-varying parameter entering the\nobjective, which can be measured at the time of decision but whose future\nvalues are unknown. We consider the case of strongly convex objective functions\nwith Lipschitz continuous gradients and address the class of running algorithms\nwhere only one iteration per time change is performed. 
We model these\nalgorithms as discrete-time linear parameter varying (LPV) systems in feedback\nwith a time-varying gradient. We leverage the approach of analyzing algorithms\nas uncertain control interconnections with integral quadratic constraints\n(IQCs) and generalize that framework to the time-varying case. We propose novel\nIQCs that are capable of capturing the behavior of time-varying nonlinearities\nand leverage techniques from the LPV literature to establish novel bounds on\nthe tracking error. Quantitative bounds can be computed by solving a\nsemi-definite program and can be interpreted as an input-to-state stability\nresult with respect to a disturbance signal which increases with the temporal\nvariability of the problem. As a departure from results in this research area,\nour bounds introduce terms that can be interpreted as a temporal rate of change\nin the cost function and the optimal value. We exemplify our main results with\nnumerical experiments that showcase how our analysis framework is able to\ncapture convergence rates of different first-order algorithms for time-varying\noptimization through the choice of IQC and rate bounds.\n","authors":["Fabian Jakob","Andrea Iannelli"],"pdf_url":"https://arxiv.org/pdf/2501.07461v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06782v2","updated":"2025-01-13T14:11:49Z","published":"2024-11-11T08:19:54Z","title":"QuadWBG: Generalizable Quadrupedal Whole-Body Grasping","summary":" Legged robots with advanced manipulation capabilities have the potential to\nsignificantly improve household duties and urban maintenance. Despite\nconsiderable progress in developing robust locomotion and precise manipulation\nmethods, seamlessly integrating these into cohesive whole-body control for\nreal-world applications remains challenging. In this paper, we present a\nmodular framework for robust and generalizable whole-body loco-manipulation\ncontroller based on a single arm-mounted camera. 
By using reinforcement\nlearning (RL), we enable a robust low-level policy for command execution over 5\ndimensions (5D) and a grasp-aware high-level policy guided by a novel metric,\nGeneralized Oriented Reachability Map (GORM). The proposed system achieves\nstate-of-the-art one-time grasping accuracy of 89% in the real world, including\nchallenging tasks such as grasping transparent objects. Through extensive\nsimulations and real-world experiments, we demonstrate that our system can\neffectively manage a large workspace, from floor level to above body height,\nand perform diverse whole-body loco-manipulation tasks.\n","authors":["Jilong Wang","Javokhirbek Rajabov","Chaoyi Xu","Yiming Zheng","He Wang"],"pdf_url":"https://arxiv.org/pdf/2411.06782v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07273v1","updated":"2025-01-13T12:36:11Z","published":"2025-01-13T12:36:11Z","title":"An Extended Survey and a Comparison Framework for Dataflow Models of\n Computation and Communication","summary":" Dataflow Model of Computation and Communications (DF MoCCs) is a formalism\nused to specify the behavior of Cyber-Physical Systems (CPSs). DF MoCCs are\nwidely used in the design of CPSs, as they provide a high-level of abstraction\nto specify the system's behavior. DF MoCCs rules give semantics to a dataflow\nspecification of a CPS, and static analysis algorithms rely on these semantics\nto guarantee safety properties of the dataflow specification, such as bounded\nmemory usage and deadlock freeness. A wide range of DF MoCCs exists, each with\nits own characteristics and static analyses. This paper presents a survey of\nthose DF MoCCs and a classification in eight categories. In addition, DF MoCCs\nare characterized by a comprehensive list of features and static analyses,\nwhich reflect their expressiveness and analyzability. 
Based on this\ncharacterization, a framework is proposed to compare the expressiveness and the\nanalyzability of DF MoCCs quantitatively.\n","authors":["Guillaume Roumage","Selma Azaiez","Cyril Faure","Stéphane Louise"],"pdf_url":"https://arxiv.org/pdf/2501.07273v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07247v1","updated":"2025-01-13T11:55:04Z","published":"2025-01-13T11:55:04Z","title":"Interpretable machine-learning for predicting molecular weight of PLA\n based on artificial bee colony optimization algorithm and adaptive neurofuzzy\n inference system","summary":" This article discusses the integration of the Artificial Bee Colony (ABC)\nalgorithm with two supervised learning methods, namely Artificial Neural\nNetworks (ANNs) and Adaptive Network-based Fuzzy Inference System (ANFIS), for\nfeature selection from Near-Infrared (NIR) spectra for predicting the molecular\nweight of medical-grade Polylactic Acid (PLA). During extrusion processing of\nPLA, in-line NIR spectra were captured along with extrusion process and machine\nsetting data. With a dataset comprising 63 observations and 512 input features,\nappropriate machine learning tools are essential for interpreting data and\nselecting features to improve prediction accuracy. Initially, the ABC\noptimization algorithm is coupled with ANN/ANFIS to forecast PLA molecular\nweight. The objective functions of the ABC algorithm are to minimize the root\nmean square error (RMSE) between experimental and predicted PLA molecular\nweights while also minimizing the number of input features. Results indicate\nthat employing ABC-ANFIS yields the lowest RMSE of 282 Da and identifies four\nsignificant parameters (NIR wavenumbers 6158 cm-1, 6310 cm-1, 6349 cm-1, and\nmelt temperature) for prediction. 
These findings demonstrate the effectiveness\nof using the ABC algorithm with ANFIS for selecting a minimal set of features\nto predict PLA molecular weight with high accuracy during processing\n","authors":["Amir Pouya Masoumi","Leo Creedon","Ramen Ghosh","Nimra Munir","Ross McMorrow","Marion McAfee"],"pdf_url":"https://arxiv.org/pdf/2501.07247v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07191v1","updated":"2025-01-13T10:38:12Z","published":"2025-01-13T10:38:12Z","title":"Pre-Trained Large Language Model Based Remaining Useful Life Transfer\n Prediction of Bearing","summary":" Accurately predicting the remaining useful life (RUL) of rotating machinery,\nsuch as bearings, is essential for ensuring equipment reliability and\nminimizing unexpected industrial failures. Traditional data-driven deep\nlearning methods face challenges in practical settings due to inconsistent\ntraining and testing data distributions and limited generalization for\nlong-term predictions.\n","authors":["Laifa Tao","Zhengduo Zhao","Xuesong Wang","Bin Li","Wenchao Zhan","Xuanyuan Su","Shangyu Li","Qixuan Huang","Haifei Liu","Chen Lu","Zhixuan Lian"],"pdf_url":"https://arxiv.org/pdf/2501.07191v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07187v1","updated":"2025-01-13T10:35:23Z","published":"2025-01-13T10:35:23Z","title":"Real-time Mode-Aware Dataflow: A Dataflow Model to Specify and Analyze\n Mode-dependent CPSs under Relaxed Timing Constraints","summary":" Modern Cyber-Physical Systems (CPS) often exhibit both relaxed real-time\nconstraints and a mode-dependent execution. Relaxed real-time constraints mean\nthat only a subset of the processes of a CPS have real-time constraints, and a\nmode-dependent CPS has conditional execution branches. Static analysis tools,\nsuch as the PolyGraph model (a formalism extending the Cyclo-Static Dataflow\nmodel with real-time constraints), can specify and analyze systems with relaxed\nreal-time constraints. 
However, PolyGraph is limited in its ability to specify\nand analyze mode-dependent CPSs. This paper extends PolyGraph with routing\nactors, yielding the Routed PolyGraph model. This model is further extended to\nthe Real-time Mode-Aware Dataflow (RMDF), which both leverages routing actors\nand incorporates a new dataflow actor to specify mode-dependent CPSs under\nrelaxed real-time constraints. This paper also extends the static analyses of\nPolyGraph to RMDF. We showcase the application of RMDF with a specification and\nan analysis (derivation of timing constraints at the job-level and a\nfeasibility test) of the vision processing system of the Ingenuity Mars\nhelicopter.\n","authors":["Guillaume Roumage","Selma Azaiez","Cyril Faure","Stéphane Louise"],"pdf_url":"https://arxiv.org/pdf/2501.07187v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07180v1","updated":"2025-01-13T10:19:30Z","published":"2025-01-13T10:19:30Z","title":"Evaluating Robotic Approach Techniques for the Insertion of a Straight\n Instrument into a Vitreoretinal Surgery Trocar","summary":" Advances in vitreoretinal robotic surgery enable precise techniques for gene\ntherapies. This study evaluates three robotic approaches using the 7-DoF\nrobotic arm for docking a micro-precise tool to a trocar: fully co-manipulated,\nhybrid co-manipulated/teleoperated, and hybrid with camera assistance. The\nfully co-manipulated approach was the fastest but had a 42% success rate.\nHybrid methods showed higher success rates (91.6% and 100%) and completed tasks\nwithin 2 minutes. 
NASA Task Load Index (TLX) assessments indicated lower\nphysical demand and effort for hybrid approaches.\n","authors":["Ross Henry","Martin Huber","Anestis Mablekos-Alexiou","Carlo Seneci","Mohamed Abdelaziz","Hans Natalius","Lyndon da Cruz","Christos Bergeles"],"pdf_url":"https://arxiv.org/pdf/2501.07180v1.pdf","comment":"2 Pages, 2 Figures, 1 Table"},{"id":"http://arxiv.org/abs/2501.07148v1","updated":"2025-01-13T09:22:17Z","published":"2025-01-13T09:22:17Z","title":"Implementing LoRa MIMO System for Internet of Things","summary":" Bandwidth constraints limit LoRa implementations. Contemporary IoT\napplications require higher throughput than that provided by LoRa. This work\nintroduces a LoRa Multiple Input Multiple Output (MIMO) system and a spatial\nmultiplexing algorithm to address LoRa's bandwidth limitation. The transceivers\nin the proposed approach modulate the signals on distinct frequencies of the\nsame LoRa band. A Frequency Division Multiplexing (FDM) method is used at the\ntransmitters to provide a wider MIMO channel. Unlike conventional Orthogonal\nFrequency Division Multiplexing (OFDM) techniques, this work exploits the\northogonality of the LoRa signals facilitated by its proprietary Chirp Spread\nSpectrum (CSS) modulation to perform an OFDM in the proposed LoRa MIMO system.\nBy varying the Spreading Factor (SF) and bandwidth of LoRa signals, orthogonal\nsignals can transmit on the same frequency irrespective of the FDM. Even though\nthe channel correlation is minimal for different spreading factors and\nbandwidths, different Carrier Frequencies (CF) ensure the signals do not\noverlap and provide additional degrees of freedom. This work assesses the\nproposed model's performance and conducts an extensive analysis to provide an\noverview of resources consumed by the proposed system. 
Finally, this work\nprovides the detailed results of a thorough evaluation of the model on test\nhardware.\n","authors":["Atonu Ghosh","Sharath Chandan","Sudip Misra"],"pdf_url":"https://arxiv.org/pdf/2501.07148v1.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2405.04287v3","updated":"2025-01-13T09:06:54Z","published":"2024-05-07T12:58:37Z","title":"Asymmetry of Frequency Distribution in Power Systems: Sources,\n Estimation, Impact and Control","summary":" This paper analyses an emerging real-world phenomena in inverter-based\nrenewable-dominated power systems, namely, asymmetry of frequency distribution.\nThe paper first provides a rationale on why asymmetry reduces the \"quality\" of\nthe frequency control and system operation. Then it provides qualitative\ntheoretical insights that explain asymmetry in terms of the nonlinearity of\nreal-world power systems and associated models. In particular network losses\nand pitch angle-based frequency control of wind power plants are discussed.\nThen the paper proposes a nonlinear compensation control to reduce the\nasymmetry as well as a statistical metric based on the frequency probability\ndistribution to quantify the level of asymmetry in a power system. 
Real-world\ndata obtained from the Irish and Australian transmission systems serve to\nsupport the theoretical appraisal, whereas simulations based on an IEEE\nbenchmark system show the effectiveness of the proposed nonlinear compensation.\nThe case study also shows that, while automatic generation control reduces\nasymmetry, frequency control limits and droop-based frequency support provided\nby wind generation using a tight deadband of 15 mHz, namely active power\ncontrol, leads to a significant increase in the asymmetry of the frequency\nprobability distribution.\n","authors":["Taulant Kerci","Federico Milano"],"pdf_url":"https://arxiv.org/pdf/2405.04287v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07126v1","updated":"2025-01-13T08:30:09Z","published":"2025-01-13T08:30:09Z","title":"A Federated Deep Learning Framework for Cell-Free RSMA Networks","summary":" Next-generation wireless networks are poised to benefit significantly from\nthe integration of three key technologies (KTs): Rate-Splitting Multiple Access\n(RSMA), cell-free architectures, and federated learning. Each of these\ntechnologies offers distinct advantages in terms of security, robustness, and\ndistributed structure. In this paper, we propose a novel cell-free network\narchitecture that incorporates RSMA and employs machine learning techniques\nwithin a federated framework. This combination leverages the strengths of each\nKT, creating a synergistic effect that maximizes the benefits of security,\nrobustness, and distributed structure. We formally formulate the access point\n(AP) selection and precoder design for max-min rate optimization in a cell-free\nMIMO RSMA network. Our proposed solution scheme involves a three-block\nprocedure. The first block trains deep reinforcement learning (DRL) neural\nnetworks to obtain RSMA precoders, assuming full connectivity between APs and\nuser equipments (UEs). 
The second block uses these precoders and principal\ncomponent analysis (PCA) to assign APs to UEs by removing a subset of AP-UE\nconnections. The final block fine-tunes the RSMA precoders by incorporating the\nassociated APs into a second DRL network. To leverage the distributed nature of\nthe cell-free network, this process is implemented in a Federated Deep\nReinforcement Learning (FDRL) structure operating through the cooperation of\nAPs and a central processing unit (CPU). Simulation results demonstrate that\nthe proposed FDRL approach performs comparably to a benchmark centralized DRL\nscheme. Our FDRL approach provides a balanced trade-off, maintaining high\nperformance with enhanced security and reduced processing demands.\n","authors":["S. Ali Mousavi","Mehdi Monemi","Reza Mohseni","Matti Latva-aho"],"pdf_url":"https://arxiv.org/pdf/2501.07126v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.09385v2","updated":"2025-01-13T05:46:45Z","published":"2024-07-12T16:06:07Z","title":"Cost-optimized probabilistic maintenance for condition monitoring of\n wind turbines with rare failures","summary":" We propose a method, a model, and a form of presenting model results for\ncondition monitoring of a small set of wind turbines with rare failures. The\nmain new ingredient of the method is to sample failure thresholds according to\nthe profit they give to an operating company. The model is a multiple linear\nregression with seasonal components and external regressors, representing all\nsensor components except for the selected one. To overcome the scarcity of the\ntraining data, we use the median sensor values from all available turbines in\ntheir healthy state. The cumulated deviation from the normal behavior model\nobtained for this median turbine is calibrated for each turbine at the\nbeginning of the test period and after known failures. 
The proposed form of\npresenting results is to set a scale for possible costs, control for random\nmaintenance, and show a whole distribution of costs depending on the free model\nparameters. We make a case study on an open dataset with SCADA data from\nmultiple sensors and show that considering the influence of turbine components\nis more critical than seasonality. The distribution, the average, and the\nstandard deviation of maintenance costs can be very different for similar\nminimal costs. Random maintenance can be more profitable than reactive\nmaintenance and other approaches. Our predictive maintenance model outperforms\nrandom maintenance and competitors for the whole set of considered turbines,\ngiving substantial savings.\n","authors":["Viktor Begun","Ulrich Schlickewei"],"pdf_url":"https://arxiv.org/pdf/2407.09385v2.pdf","comment":"Improved and finally accepted journal version"},{"id":"http://arxiv.org/abs/2410.08147v6","updated":"2025-01-13T05:29:14Z","published":"2024-10-10T17:31:36Z","title":"The Bouc-Wen Model for Binary Direct Collinear Collisions of Convex\n Viscoplastic Bodies","summary":" We study mathematical models of binary direct collinear collisions of convex\nviscoplastic bodies based on two incremental collision laws that employ the\nBouc-Wen differential model of hysteresis to represent the elastoplastic\nbehavior of the materials of the colliding bodies. These collision laws are the\nBouc-Wen-Simon-Hunt-Crossley Collision Law (BWSHCCL) and the Bouc-Wen-Maxwell\nCollision Law (BWMCL). The BWSHCCL comprises the Bouc-Wen model amended with\na nonlinear Hertzian elastic spring element and connected in parallel to a\nnonlinear displacement-dependent and velocity-dependent energy dissipation\nelement. The BWMCL comprises the Bouc-Wen model amended with a nonlinear\nHertzian elastic spring element and connected in series to a linear\nvelocity-dependent energy dissipation element. 
The mathematical models of the\ncollision process are presented in the form of finite-dimensional initial value\nproblems. We show that the models possess favorable analytical properties\n(e.g., global existence, uniqueness, and boundedness of the solutions) under\nsuitable restrictions on the values of their parameters. Furthermore, based on\nthe results of two model parameter identification studies, we demonstrate that\ngood agreement can be attained between experimental data and numerical\napproximations of the behavior of the mathematical models across a wide range\nof initial relative velocities of the colliding bodies while using\nparameterizations of the models that are independent of the initial relative\nvelocity.\n","authors":["Mihails Milehins","Dan B. Marghitu"],"pdf_url":"https://arxiv.org/pdf/2410.08147v6.pdf","comment":"15 pages; 5 figures; (v1-v5) a variety of amendments; (v6) updated\n scaling/nondimensionalization and introduced amendments based on external\n feedback; the associated code/data are available from\n https://gitlab.com/user9716869/BWBCL"},{"id":"http://arxiv.org/abs/2407.21533v2","updated":"2025-01-13T05:12:56Z","published":"2024-07-31T11:39:10Z","title":"Data Requirements and Prediction Scaling for Long-Term Failure Forecasts\n in Wind Turbines","summary":" We investigate the key factors that enable early failure forecasting in wind\nturbines. For this purpose, we analyze studies with long-term forecasts and\ncompare their main features: prediction time, methods, targeted components,\ndataset size, and check the effect of using additional sensors. We found that\nthe size of the dataset is the main factor and that an approximate linear\nscaling holds: the number of forecast days is twice the size of the dataset,\nmeasured in turbine years. 
We also observe that the data allow us to quantify\nthe meaning of \"big\" and \"long\" in the terms \"big data\" and \"long-term\"\nforecasts, which are found to be ten turbine years and two weeks.\n","authors":["Viktor Begun","Ulrich Schlickewei"],"pdf_url":"https://arxiv.org/pdf/2407.21533v2.pdf","comment":"Improved the text and figure, updated the references"},{"id":"http://arxiv.org/abs/2501.07057v1","updated":"2025-01-13T04:31:31Z","published":"2025-01-13T04:31:31Z","title":"Optimization with Multi-sourced Reference Information and Unknown Trust:\n A Distributionally Robust Approach","summary":" In problems that involve input parameter information gathered from multiple\ndata sources with varying reliability, incorporating users' trust about\ndifferent sources in decision-optimization models can potentially improve\nsolution performance and reliability. In this work, we propose a novel\nmulti-reference distributionally robust optimization (MR-DRO) framework, where\nthe model inputs are uncertain and their probability distributions can be\nstatistically inferred from multiple data sources. Via nonparametric data\nfusion, we construct a Wasserstein ambiguity set to minimize the worst-case\nexpected value of a stochastic objective function, accounting for both\nuncertainty and unknown reliability of information sources. We reformulate the\nMR-DRO model as a linear program given linear objective and constraints in the\noriginal problem. We also incorporate a dynamic trust update mechanism that\nadjusts the trust for each source based on its performance over time. In\naddition, we introduce the concept of probability dominance to identify sources\nwith dominant trust. Via solving instances of resource allocation and portfolio\noptimization, we demonstrate the effectiveness of the trust-informed MR-DRO\napproach compared to traditional optimization frameworks relying on a single\ndata source. 
Our results highlight the significance of integrating (dynamic)\nuser trust in decision making under uncertainty, particularly when given\ndiverse and potentially conflicting input data.\n","authors":["Yanru Guo","Ruiwei Jiang","Siqian Shen"],"pdf_url":"https://arxiv.org/pdf/2501.07057v1.pdf","comment":"38 pages, 9 figures, 7 tables"},{"id":"http://arxiv.org/abs/2501.07030v1","updated":"2025-01-13T03:02:15Z","published":"2025-01-13T03:02:15Z","title":"Erasing Noise in Signal Detection with Diffusion Model: From Theory to\n Application","summary":" In this paper, a signal detection method based on the denoise diffusion model\n(DM) is proposed, which outperforms the maximum likelihood (ML) estimation\nmethod that has long been regarded as the optimal signal detection technique.\nTheoretically, a novel mathematical theory for intelligent signal detection\nbased on stochastic differential equations (SDEs) is established in this paper,\ndemonstrating the effectiveness of DM in reducing the additive white Gaussian\nnoise in received signals. Moreover, a mathematical relationship between the\nsignal-to-noise ratio (SNR) and the timestep in DM is established, revealing\nthat for any given SNR, a corresponding optimal timestep can be identified.\nFurthermore, to address potential issues with out-of-distribution inputs in the\nDM, we employ a mathematical scaling technique that allows the trained DM to\nhandle signal detection across a wide range of SNRs without any fine-tuning.\nBuilding on the above theoretical foundation, we propose a DM-based signal\ndetection method, with the diffusion transformer (DiT) serving as the backbone\nneural network, whose computational complexity is\n$\\mathcal{O}(n^2)$. 
Simulation results demonstrate that, for BPSK and QAM\nmodulation schemes, the DM-based method achieves a significantly lower symbol\nerror rate (SER) compared to ML estimation, while maintaining a much lower\ncomputational complexity.\n","authors":["Xiucheng Wang","Peilin Zheng","Nan Cheng"],"pdf_url":"https://arxiv.org/pdf/2501.07030v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07026v1","updated":"2025-01-13T02:59:03Z","published":"2025-01-13T02:59:03Z","title":"IEEE_TIE25: Analysis and Synthesis of DOb-based Robust Motion\n Controllers","summary":" By employing a unified state-space design framework, this paper proposes a\nnovel systematic analysis and synthesis method that facilitates the\nimplementation of both conventional zero-order (ZO) and high-order (HO) DObs.\nFurthermore, this design method supports the development of advanced DObs\n(e.g., the proposed High-Performance (HP) DOb in this paper), enabling more\naccurate disturbance estimation and, consequently, enhancing the robust\nstability and performance of motion control systems. Lyapunov direct method is\nemployed in the discrete-time domain to analyse the stability of the proposed\ndigital robust motion controllers. The analysis demonstrates that the proposed\nDObs are stable in the sense that the estimation error is uniformly ultimately\nbounded when subjected to bounded disturbances. Additionally, they are proven\nto be asymptotically stable under specific disturbance conditions, such as\nconstant disturbances for the ZO and HP DObs. Stability constraints on the\ndesign parameters of the DObs are analytically derived, providing effective\nsynthesis tools for the implementation of the digital robust motion\ncontrollers. The discrete-time analysis facilitates the derivation of more\npractical design constraints. 
The proposed analysis and synthesis methods have\nbeen rigorously validated through experimental evaluations, confirming their\neffectiveness.\n","authors":["Emre Sariyildiz"],"pdf_url":"https://arxiv.org/pdf/2501.07026v1.pdf","comment":"IEEE Transactions on Industrial Electronics 2025"},{"id":"http://arxiv.org/abs/2501.07005v1","updated":"2025-01-13T01:49:17Z","published":"2025-01-13T01:49:17Z","title":"Global Search for Optimal Low Thrust Spacecraft Trajectories using\n Diffusion Models and the Indirect Method","summary":" Long time-duration low-thrust nonlinear optimal spacecraft trajectory global\nsearch is a computationally and time expensive problem characterized by\nclustering patterns in locally optimal solutions. During preliminary mission\ndesign, mission parameters are subject to frequent changes, necessitating that\ntrajectory designers efficiently generate high-quality control solutions for\nthese new scenarios. Generative machine learning models can be trained to learn\nhow the solution structure varies with respect to a conditional parameter,\nthereby accelerating the global search for missions with updated parameters. In\nthis work, state-of-the-art diffusion models are integrated with the indirect\napproach for trajectory optimization within a global search framework. This\nframework is tested on two low-thrust transfers of different complexity in the\ncircular restricted three-body problem. By generating and analyzing a training\ndata set, we develop mathematical relations and techniques to understand the\ncomplex structures in the costate domain of locally optimal solutions for these\nproblems. A diffusion model is trained on this data and successfully\naccelerates the global search for both problems. The model predicts how the\ncostate solution structure changes, based on the maximum spacecraft thrust\nmagnitude. 
Warm-starting a numerical solver with diffusion model samples for\nthe costates at the initial time increases the number of solutions generated\nper minute for problems with unseen thrust magnitudes by one to two orders of\nmagnitude in comparison to samples from a uniform distribution and from an\nadjoint control transformation.\n","authors":["Jannik Graebner","Ryne Beeson"],"pdf_url":"https://arxiv.org/pdf/2501.07005v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10941v2","updated":"2025-01-13T00:03:58Z","published":"2024-11-17T02:39:58Z","title":"Efficient Estimation of Relaxed Model Parameters for Robust UAV\n Trajectory Optimization","summary":" Online trajectory optimization and optimal control methods are crucial for\nenabling sustainable unmanned aerial vehicle (UAV) services, such as\nagriculture, environmental monitoring, and transportation, where available\nactuation and energy are limited. However, optimal controllers are highly\nsensitive to model mismatch, which can occur due to loaded equipment, packages\nto be delivered, or pre-existing variability in fundamental structural and\nthrust-related parameters. To circumvent this problem, optimal controllers can\nbe paired with parameter estimators to improve their trajectory planning\nperformance and perform adaptive control. However, UAV platforms are limited in\nterms of onboard processing power, oftentimes making nonlinear parameter\nestimation too computationally expensive to consider. To address these issues,\nwe propose a relaxed, affine-in-parameters multirotor model along with an\nefficient optimal parameter estimator. We convexify the nominal Moving Horizon\nParameter Estimation (MHPE) problem into a linear-quadratic form (LQ-MHPE) via\nan affine-in-parameter relaxation on the nonlinear dynamics, resulting in fast\nquadratic programs (QPs) that facilitate adaptive Model Predictive Control (MPC)\nin real time. 
We compare this approach to the equivalent nonlinear estimator in\nMonte Carlo simulations, demonstrating a decrease in average solve time and\ntrajectory optimality cost by 98.2% and 23.9-56.2%, respectively.\n","authors":["Derek Fan","David A. Copp"],"pdf_url":"https://arxiv.org/pdf/2411.10941v2.pdf","comment":"8 pages, 5 figures, to be published in IEEE Sustech 2025"},{"id":"http://arxiv.org/abs/2501.07743v1","updated":"2025-01-13T23:14:15Z","published":"2025-01-13T23:14:15Z","title":"The Reliability of Remotely Piloted Aircraft System Performance under\n Communication Loss and Latency Uncertainties","summary":" Mission-critical use of highly maneuverable Remotely Piloted Aircraft Systems\n(RPAS) requires a thorough understanding of the reliability of their\ncommunication systems. Investigations into system-level performance under\nstochastic aviation communication conditions are critical for estimating\nmission success rates and assessing the risks associated with integrating RPAS\ninto existing airspace, ensuring overall aviation safety. This study aims to\nquantify the impact of communication latency and complete signal loss on the\nmission completion performance of a highly maneuverable RPAS. The mission is\ndefined as a static waypoint tracking task in three-dimensional airspace. We\nstart with examining and deriving mathematical formulations of key reliability\nmetrics of Required Communication Performance (RCP). These stochastic factors\nare then embedded into flight control simulations (i.e., communication\navailability and latency) to examine the system behavior. Lastly, we generate\nmission success rate and mission completion time envelopes through extensive\nmultiprocessing Monte Carlo simulations through high-performance computing. We\ndiscover a drastic deterioration in flight performance while latency or\navailability erodes the stability margin. 
In addition, we propose a new\nreliability metric, namely \textit{communicability}, which integrates three key\nRCP metrics and helps in understanding the maximum tolerable latency to flight\ncontrol. The procedure and results obtained from this research inform engineers\ndesigning RPAS with a better trade-off between communication capability and\nflight control performance. Future work includes exploring alternative flight\nsimulators (e.g., nonlinear dynamic inversion) with other missions (e.g.,\ndynamic waypoint following), or developing delay-compensated optimal controls. The\nanalysis of the stability margin is also desired for theoretical verification.\n","authors":["Yutian Pang","Andrew Paul Kendall","John-Paul Clarke"],"pdf_url":"https://arxiv.org/pdf/2501.07743v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07715v1","updated":"2025-01-13T22:06:06Z","published":"2025-01-13T22:06:06Z","title":"Analyzing the Role of the DSO in Electricity Trading of VPPs via a\n Stackelberg Game Model","summary":" The increasing penetration of distributed energy resources (DER) has sparked\ninterest in promoting their participation in the power market. Here we consider\na setting in which different virtual power plants (VPPs) with certain flexible\nresources take part in electricity trading, either by direct participation in\nthe wholesale power market, or interfaced by the Distribution System Operator\n(DSO). Our goal is to examine the role and influence of the DSO as a\nstakeholder, for which we formulate a Stackelberg game via a bilevel\noptimization model: the DSO maximizes profits at the upper level, while VPPs\nminimize operating costs at the lower level. To solve this problem, we use the\nKarush-Kuhn-Tucker optimality conditions of the convex lower-level problems to\nobtain a single-level mixed-integer nonlinear program. 
The results show that\nthe role of the DSO as an intermediary agent leads to a decrease in operating\ncosts for the VPPs, while guaranteeing a profit for the DSO.\n","authors":["Peng Wang","Xi Zhang","Luis Badesa"],"pdf_url":"https://arxiv.org/pdf/2501.07715v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07714v1","updated":"2025-01-13T21:55:48Z","published":"2025-01-13T21:55:48Z","title":"Koopman Meets Limited Bandwidth: Effect of Quantization on Data-Driven\n Linear Prediction and Control of Nonlinear Systems","summary":" Koopman-based lifted linear identification have been widely used for\ndata-driven prediction and model predictive control (MPC) of nonlinear systems.\nIt has found applications in flow-control, soft robotics, and unmanned aerial\nvehicles (UAV). For autonomous systems, this system identification method works\nby embedding the nonlinear system in a higher-dimensional linear space and\ncomputing a finite-dimensional approximation of the corresponding Koopman\noperator with the Extended Dynamic Mode Decomposition (EDMD) algorithm. EDMD is\na data-driven algorithm that estimates an approximate linear system by lifting\nthe state data-snapshots via nonlinear dictionary functions. For control\nsystems, EDMD is further modified to utilize both state and control\ndata-snapshots to estimate a lifted linear predictor with control input. This\narticle investigates how the estimation process is affected when the data is\nquantized. Specifically, we examine the fundamental connection between\nestimates of the linear predictor matrices obtained from unquantized data and\nthose from quantized data via modified EDMD. Furthermore, using the law of\nlarge numbers, we demonstrate that, under a large data regime, the quantized\nestimate can be considered a regularized version of the unquantized estimate.\nWe also explore the relationship between the two estimates in the finite data\nregime. 
We further analyze the effect of nonlinear lifting functions on this\nregularization due to quantization. The theory is validated through repeated\nnumerical experiments conducted on several control systems. The effect of\nquantization on the MPC performance is also demonstrated.\n","authors":["Shahab Ataei","Dipankar Maity","Debdipta Goswami"],"pdf_url":"https://arxiv.org/pdf/2501.07714v1.pdf","comment":"15 pages, 4 figures. arXiv admin note: text overlap with\n arXiv:2410.02803"},{"id":"http://arxiv.org/abs/2402.06108v2","updated":"2025-01-13T21:09:51Z","published":"2024-02-09T00:05:28Z","title":"United We Fall: On the Nash Equilibria of Multiplex and Multilayer\n Network Games","summary":" Network games provide a framework to study strategic decision making\nprocesses that are governed by structured interdependencies among agents.\nHowever, existing models do not account for environments in which agents\nsimultaneously interact over multiple networks, or when agents operate over\nmultiple action dimensions. In this paper, we propose new models of multiplex\nnetwork games to capture the different modalities of interactions among\nstrategic agents, and multilayer network games to capture their interactions\nover multiple action dimensions. We explore how the properties of the\nconstituent networks of a multiplex/multilayer network can undermine or support\nthe existence, uniqueness, and stability of the game's Nash equilibria.\nNotably, we highlight that both the largest and smallest eigenvalues of the\nconstituent networks (reflecting their connectivity and two-sidedness,\nrespectively) are instrumental in determining the uniqueness of the\nmultiplex/multilayer network game's equilibrium. 
Together, our findings shed\nlight on the reasons for the fragility of equilibria when agents interact over\nnetworks of networks, and point out potential interventions to alleviate them.\n","authors":["Raman Ebrahimi","Parinaz Naghizadeh"],"pdf_url":"https://arxiv.org/pdf/2402.06108v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07689v1","updated":"2025-01-13T21:05:04Z","published":"2025-01-13T21:05:04Z","title":"Real-Time Outlier Connections Detection in Databases Network Traffic","summary":" The article describes a practical method for detecting outlier database\nconnections in real-time. Outlier connections are detected with a specified\nlevel of confidence. The method is based on generalized security rules and a\nsimple but effective real-time machine learning mechanism. The described method\nis non-intrusive to the database and does not depend on the type of database.\nThe method is used to proactively control access even before database\nconnection is established, minimize false positives, and maintain the required\nresponse speed to detected database connection outliers. The capabilities of\nthe system are demonstrated with several examples of outliers in real-world\nscenarios.\n","authors":["Leonid Rodniansky","Tania Butovsky","Mikhail Shpak"],"pdf_url":"https://arxiv.org/pdf/2501.07689v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.07982v2","updated":"2025-01-13T20:44:40Z","published":"2024-12-10T23:54:44Z","title":"Data-Driven Assessment of Vehicle-to-Grid Capabilities in Supporting\n Grid During Emergencies: Case Study of Travis County, TX","summary":" As extreme weather events become more common and threaten power grids, the\ncontinuing adoption of electric vehicles (EVs) introduces a growing opportunity\nfor their use as a distributed energy storage resource. This energy storage can\nbe used as backup generation through the use of vehicle-to-grid (V2G)\ntechnology, where electricity is sent back from EV batteries to the grid. 
With\nenough participation from EV owners, V2G can mitigate outages during grid\nemergencies. In order to investigate a practical application of V2G, this study\nleverages a vast array of real-world data, such as survey results on V2G\nparticipation willingness, historical outage data within ERCOT, current EV\nregistrations, and demographic data. This data informs realistic emergency grid\nscenarios with V2G support using a synthetic transmission grid for Travis\nCounty. The results find that as EV ownership rises in the coming years, the\nsimultaneous facilitation of bidirectional charging availability would allow\nfor V2G to play a substantial role in preventing involuntary load shed as a\nresult of emergencies like winter storms.\n","authors":["Kelsey Nelson","Javad Mohammadi"],"pdf_url":"https://arxiv.org/pdf/2412.07982v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.15728v2","updated":"2025-01-13T19:46:20Z","published":"2024-02-24T05:36:14Z","title":"Design and Implementation of Low-Cost Electric Vehicles (Evs)\n Supercharger: A Comprehensive Review","summary":" This article presents a probabilistic modeling method utilizing smart meter\ndata and an innovative agent-based simulator for electric vehicles (EVs). The\naim is to assess the effects of different cost-driven EV charging strategies on\nthe power distribution network (PDN). We investigate the effects of a 40% EV\nadoption on three parts of Frederiksberg's low voltage distribution network\n(LVDN), a densely urbanized municipality in Denmark. Our findings indicate that\ncable and transformer overloading especially pose a challenge. However, the\nimpact of EVs varies significantly between each LVDN area and charging\nscenario. Across scenarios and LVDNs, the share of cables facing congestion\nranges between 5% and 60%. It is also revealed that time-of-use (ToU)-based and\nsingle-day cost-minimized charging could be beneficial for LVDNs with moderate\nEV adoption rates. 
In contrast, multiple-day optimization will likely lead to\nsevere congestion, as such strategies concentrate demand on a single day that\nwould otherwise be distributed over several days, thus raising concerns about\nhow to prevent it. The broader implications of our research suggest that,\ndespite initial worries primarily centered on congestion due to unregulated\ncharging during peak hours, a transition to cost-based smart charging,\npropelled by an increasing awareness of time-dependent electricity prices, may\nlead to a significant rise in charging synchronization, bringing about\nundesirable consequences for the power distribution network (PDN).\n","authors":["Md Khaledur Rahman","Faysal Amin Tanvir","Md Saiful Islam","Md Shameem Ahsan","Manam Ahmed"],"pdf_url":"https://arxiv.org/pdf/2402.15728v2.pdf","comment":"arXiv admin note: This work has been withdrawn by arXiv\n administrators due to inappropriate text reuse from external sources"},{"id":"http://arxiv.org/abs/2501.07652v1","updated":"2025-01-13T19:24:14Z","published":"2025-01-13T19:24:14Z","title":"Finite Sample Identification of Partially Observed Bilinear Dynamical\n Systems","summary":" We consider the problem of learning a realization of a partially observed\nbilinear dynamical system (BLDS) from noisy input-output data. Given a single\ntrajectory of input-output samples, we provide a finite time analysis for\nlearning the system's Markov-like parameters, from which a balanced realization\nof the bilinear system can be obtained. Our bilinear system identification\nalgorithm learns the system's Markov-like parameters by regressing the outputs\nto highly correlated, nonlinear, and heavy-tailed covariates. Moreover, the\nstability of BLDS depends on the sequence of inputs used to excite the system.\nThese properties, unique to partially observed bilinear dynamical systems, pose\nsignificant challenges to the analysis of our algorithm for learning the\nunknown dynamics. 
We address these challenges and provide high probability\nerror bounds on our identification algorithm under a uniform stability\nassumption. Our analysis provides insights into system theoretic quantities\nthat affect learning accuracy and sample complexity. Lastly, we perform\nnumerical experiments with synthetic data to reinforce these insights.\n","authors":["Yahya Sattar","Yassir Jedra","Maryam Fazel","Sarah Dean"],"pdf_url":"https://arxiv.org/pdf/2501.07652v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17753v2","updated":"2025-01-13T19:10:31Z","published":"2024-05-28T02:11:21Z","title":"Regression Equilibrium in Electricity Markets","summary":" In two-stage electricity markets, renewable power producers enter the\nday-ahead market with a forecast of future power generation and then reconcile\nany forecast deviation in the real-time market at a penalty. The choice of the\nforecast model is thus an important strategy decision for renewable power\nproducers as it affects financial performance. In electricity markets with\nlarge shares of renewable generation, the choice of the forecast model impacts\nnot only individual performance but also outcomes for other producers. In this\npaper, we argue for the existence of a competitive regression equilibrium in\ntwo-stage electricity markets in terms of the parameters of private forecast\nmodels informing the participation strategies of renewable power producers. In\nour model, renewables optimize the forecast against the day-ahead and real-time\nprices, thereby maximizing the average profits across the day-ahead and\nreal-time markets. By doing so, they also implicitly enhance the temporal cost\ncoordination of day-ahead and real-time markets. We base the equilibrium\nanalysis on the theory of variational inequalities, providing results on the\nexistence and uniqueness of regression equilibrium in energy-only markets. 
We\nalso devise two methods to compute regression equilibrium: centralized\noptimization and a decentralized ADMM-based algorithm.\n","authors":["Vladimir Dvorkin"],"pdf_url":"https://arxiv.org/pdf/2405.17753v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07616v1","updated":"2025-01-13T12:29:37Z","published":"2025-01-13T12:29:37Z","title":"The Ingenuity Mars Helicopter Specified and Analyzed with the Real-time\n Mode-aware Dataflow Model","summary":" Ingenuity is an autonomous Cyber-Physical System (CPS) that has successfully\ncompleted more than 70 flights over Mars between 2021 and 2024. Ensuring the\nsafety of its mission is paramount, as any failure could result in catastrophic\neconomic damage and significant financial losses. Dataflow Models of\nComputation and Communication (DF MoCCs) serve as a formal framework for\nspecifying and analyzing the timing behavior of such CPSs. In particular, the\nReal-time Mode-aware Dataflow (RMDF) model is highly suitable to specify and\nanalyze real-time and mode-dependent Cyber-Physical Systems (CPSs) like\nIngenuity. This paper showcases the application of RMDF for the specification\nand analysis of Ingenuity. We propose a dataflow specification of Ingenuity,\nanalyze its timing behavior, and provide a feasibility test. 
Finally, we\npropose a plausible explanation of the timing anomaly that occurred during the\nsixth flight of Ingenuity.\n","authors":["Guillaume Roumage","Selma Azaiez","Cyril Faure","Stéphane Louise"],"pdf_url":"https://arxiv.org/pdf/2501.07616v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2501.07187"},{"id":"http://arxiv.org/abs/2207.11132v3","updated":"2025-01-13T02:38:21Z","published":"2022-07-16T13:43:58Z","title":"Proactive Distributed Emergency Response with Heterogeneous Tasks\n Allocation","summary":" Traditionally, traffic incident management (TIM) programs coordinate the\ndeployment of emergency resources to immediate incident requests without\naccommodating the interdependencies on incident evolutions in the environment.\nHowever, ignoring inherent interdependencies on the evolution of incidents in\nthe environment while making current deployment decisions is shortsighted, and\nthe resulting naive deployment strategy can significantly worsen the overall\nincident delay impact on the network. The interdependencies on incident\nevolution in the environment, including those between incident occurrences, and\nthose between resource availability in near-future requests and the anticipated\nduration of the immediate incident request, should be considered through a\nlook-ahead model when making current-stage deployment decisions. This study\ndevelops a new proactive framework based on the distributed constraint\noptimization problem (DCOP) to address the above limitations, overcoming\nconventional TIM models that cannot accommodate the dependencies in the TIM\nproblem. Furthermore, the optimization objective is formulated to incorporate\nUnmanned Aerial Vehicles (UAVs). The UAVs' role in TIM includes exploring\nuncertain traffic conditions, detecting unexpected events, and augmenting\ninformation from roadway traffic sensors. 
Robustness analysis of our model for\nmultiple TIM scenarios shows satisfactory performance using local search\nexploration heuristics. Overall, our model reports a significant reduction in\ntotal incident delay compared to conventional TIM models. With UAV support, we\ndemonstrate a further decrease in the total incident delay ranging between 5%\nand 45% for different numbers of incidents. UAVs' active sensing can shorten the\nresponse time of emergency vehicles and reduce the uncertainties\nassociated with the estimated incident delay impact.\n","authors":["Justice Darko","Hyoshin Park"],"pdf_url":"https://arxiv.org/pdf/2207.11132v3.pdf","comment":"16 pages, 13 figures, 3 tables, journal"},{"id":"http://arxiv.org/abs/2501.10441v1","updated":"2025-01-13T22:28:04Z","published":"2025-01-13T22:28:04Z","title":"A Review of Detection, Evolution, and Data Reconstruction Strategies for\n False Data Injection Attacks in Power Cyber-Physical Systems","summary":" The integration of information and physical systems in modern power grids has\nheightened vulnerabilities to False Data Injection Attacks (FDIAs), threatening\nthe secure operation of power cyber-physical systems (CPS). This paper reviews\nFDIA detection, evolution, and data reconstruction strategies, highlighting\ncross-domain coordination, multi-temporal evolution, and stealth\ncharacteristics. Challenges in existing detection methods, including poor\ninterpretability and data imbalance, are discussed, alongside advanced\nstate-aware and action-control data reconstruction techniques. Key issues, such\nas modeling FDIA evolution and distinguishing malicious data from regular\nfaults, are identified. 
Future directions to enhance system resilience and\ndetection accuracy are proposed, contributing to the secure operation of power\nCPS.\n","authors":["Xiaoyong Bo"],"pdf_url":"https://arxiv.org/pdf/2501.10441v1.pdf","comment":"34 pages, 4 figures, 6 tables"},{"id":"http://arxiv.org/abs/2501.10438v1","updated":"2025-01-13T14:40:49Z","published":"2025-01-13T14:40:49Z","title":"Event-Based Impulsive Control for Spacecraft Rendezvous Hovering Phases","summary":" This work presents an event-triggered controller for spacecraft rendezvous\nhovering phases. The goal is to maintain the chaser within a bounded region\nwith respect to the target. The main assumption is that the chaser vehicle has\nimpulsive thrusters. These are assumed to be orientable in any direction and\nare constrained by dead-zone and saturation bounds. The event-based controller\nrelies on trigger rules deciding when a suitable control law is applied. The\nlocal control law consists of a single impulse; therefore the trigger rules\ndesign is based on the instantaneous reachability to the admissible set. The\nfinal outcome is a very efficient algorithm from both computational burden and\nfootprint perspectives. Because the proposed methodology is based on a single\nimpulse control, the controller invariance is local and assessed through\nimpulsive systems theory. Finally, numerical results are shown and discussed.\n","authors":["Julio C. 
Sanchez","Christophe Louembet","Francisco Gavilan","Rafael Vazquez"],"pdf_url":"https://arxiv.org/pdf/2501.10438v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.10437v1","updated":"2025-01-13T14:19:06Z","published":"2025-01-13T14:19:06Z","title":"Chance-constrained Model Predictive Control for Near Rectilinear Halo\n Orbit spacecraft rendezvous","summary":" This work presents a robust Model Predictive Controller (MPC) to solve the\nproblem of spacecraft rendezvous in the context of the restricted three-body\nproblem (R3BP) as will be required to dock with space stations in cislunar\nspace. The employed methodology is valid for both chemical and electric\nthrusters. By exploiting the state transition matrix and using a\nchance-constrained approach, the robust MPC assures constraint satisfaction\nunder the presence of disturbances in a probabilistic sense. The perturbation\nparameters are computed on-line using a disturbance estimator. The robust\ncontroller is tested for a rendezvous scenario with a target placed in an\nEarth-Moon Near-Rectilinear Halo Orbit. Numerical results are shown and\ndiscussed.\n","authors":["Julio C. Sanchez","Francisco Gavilan","Rafael Vazquez"],"pdf_url":"https://arxiv.org/pdf/2501.10437v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.10436v1","updated":"2025-01-13T14:12:41Z","published":"2025-01-13T14:12:41Z","title":"A flatness-based predictive controller for six-degrees of freedom\n spacecraft rendezvous","summary":" This work presents a closed-loop guidance algorithm for six-degrees of\nfreedom spacecraft rendezvous with a passive target flying in an eccentric\norbit. The main assumption is that the chaser vehicle has an attitude control\nsystem, based on reaction wheels, providing the necessary torque to change its\norientation whereas the number of thrusters is arbitrary. The goal is to design\nfuel optimal maneuvers while satisfying operational constraints and rejecting\ndisturbances. 
The proposed method is as follows: first, the coupled\ntranslational and angular dynamics are transformed to equivalent algebraic\nrelations using the relative translational states transition matrix and the\nattitude flatness property. Then, a direct transcription method, based on\nB-splines parameterization and discretization of time continuous constraints,\nis developed to obtain a tractable static program. Finally, a Model Predictive\nController, based on linearization around the previously computed solution, is\nconsidered to handle disturbances. Numerical results are shown and discussed.\n","authors":["Julio C. Sanchez","Francisco Gavilan","Rafael Vazquez","Christophe Louembet"],"pdf_url":"https://arxiv.org/pdf/2501.10436v1.pdf","comment":null}],"Optimization and Control":[{"id":"http://arxiv.org/abs/2403.02079v3","updated":"2025-01-13T18:01:08Z","published":"2024-03-04T14:26:22Z","title":"The ultimate upper bound on the injectivity radius of the Stiefel\n manifold","summary":" We exhibit conjugate points on the Stiefel manifold endowed with any member\nof the family of Riemannian metrics introduced by H\\\"uper et al. (2021). This\nfamily contains the well-known canonical and Euclidean metrics. An upper bound\non the injectivity radius of the Stiefel manifold in the considered metric is\nthen obtained as the minimum between the length of the geodesic along which the\npoints are conjugate and the length of certain geodesic loops. Numerical\nexperiments support the conjecture that the obtained upper bound is in fact\nequal to the injectivity radius.\n","authors":["P. -A. 
Absil","Simon Mataigne"],"pdf_url":"https://arxiv.org/pdf/2403.02079v3.pdf","comment":"Version accepted for publication in SIAM Journal on Matrix Analysis\n and Applications on 6 January 2025"},{"id":"http://arxiv.org/abs/2501.07505v1","updated":"2025-01-13T17:22:58Z","published":"2025-01-13T17:22:58Z","title":"An Error Analysis of Second Order Elliptic Optimal Control Problem via\n Hybrid Higher Order Methods","summary":" This paper presents the design and analysis of a Hybrid High-Order (HHO)\napproximation for a distributed optimal control problem governed by the Poisson\nequation. We propose three distinct schemes to address unconstrained control\nproblems and two schemes for constrained control problems. For the\nunconstrained control problem, while standard finite elements achieve a\nconvergence rate of \\( k+1 \\) (with \\( k \\) representing the polynomial\ndegree), our approach enhances this rate to \\( k+2 \\) by selecting the control\nfrom a carefully constructed reconstruction space. For the box-constrained\nproblem, we demonstrate that using lowest-order elements (\\( \\mathbb{P}_0 \\))\nyields linear convergence, in contrast to finite element methods (FEM) that\nrequire linear elements to achieve comparable results. Furthermore, we derive a\ncubic convergence rate for control in the variational discretization scheme.\nNumerical experiments are provided to validate the theoretical findings.\n","authors":["Gouranga Mallik","Ramesh Chandra Sau"],"pdf_url":"https://arxiv.org/pdf/2501.07505v1.pdf","comment":"34 pages"},{"id":"http://arxiv.org/abs/2407.00843v3","updated":"2025-01-13T16:58:43Z","published":"2024-06-30T22:33:47Z","title":"A Unified Approach to Extract Interpretable Rules from Tree Ensembles\n via Integer Programming","summary":" Tree ensembles are very popular machine learning models, known for their\neffectiveness in supervised classification and regression tasks. 
Their\nperformance derives from aggregating predictions of multiple decision trees,\nwhich are renowned for their interpretability properties. However, tree\nensemble models do not reliably exhibit interpretable output. Our work aims to\nextract an optimized list of rules from a trained tree ensemble, providing the\nuser with a condensed, interpretable model that retains most of the predictive\npower of the full model. Our approach consists of solving a set partitioning\nproblem formulated through Integer Programming. The proposed method works with\neither tabular or time series data, for both classification and regression\ntasks, and its flexible formulation can include any arbitrary loss or\nregularization functions. Our extensive computational experiments offer\nstatistically significant evidence that our method is competitive with other\nrule extraction methods in terms of predictive performance and fidelity towards\nthe tree ensemble. Moreover, we empirically show that the proposed method\neffectively extracts interpretable rules from tree ensembles that are designed\nfor time series data.\n","authors":["Lorenzo Bonasera","Emilio Carrizosa"],"pdf_url":"https://arxiv.org/pdf/2407.00843v3.pdf","comment":"- Improved overall manuscript flow and clearness - Added related work\n on explanation fidelity - Added computational results on fidelity - Fixed\n some flaws on data inference - Optimization problem with weighted objectives\n - Added appendix containing qualitative examples - New computational results"},{"id":"http://arxiv.org/abs/2501.07461v1","updated":"2025-01-13T16:30:56Z","published":"2025-01-13T16:30:56Z","title":"A Linear Parameter-Varying Framework for the Analysis of Time-Varying\n Optimization Algorithms","summary":" In this paper we propose a framework to analyze iterative first-order\noptimization algorithms for time-varying convex optimization. 
We assume that\nthe temporal variability is caused by a time-varying parameter entering the\nobjective, which can be measured at the time of decision but whose future\nvalues are unknown. We consider the case of strongly convex objective functions\nwith Lipschitz continuous gradients and address the class of running algorithms\nwhere only one iteration per time change is performed. We model these\nalgorithms as discrete-time linear parameter varying (LPV) systems in feedback\nwith a time-varying gradient. We leverage the approach of analyzing algorithms\nas uncertain control interconnections with integral quadratic constraints\n(IQCs) and generalize that framework to the time-varying case. We propose novel\nIQCs that are capable of capturing the behavior of time-varying nonlinearities\nand leverage techniques from the LPV literature to establish novel bounds on\nthe tracking error. Quantitative bounds can be computed by solving a\nsemi-definite program and can be interpreted as an input-to-state stability\nresult with respect to a disturbance signal which increases with the temporal\nvariability of the problem. As a departure from results in this research area,\nour bounds introduce terms that can be interpreted as a temporal rate of change\nin the cost function and the optimal value. 
We exemplify our main results with\nnumerical experiments that showcase how our analysis framework is able to\ncapture convergence rates of different first-order algorithms for time-varying\noptimization through the choice of IQC and rate bounds.\n","authors":["Fabian Jakob","Andrea Iannelli"],"pdf_url":"https://arxiv.org/pdf/2501.07461v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07427v1","updated":"2025-01-13T15:48:45Z","published":"2025-01-13T15:48:45Z","title":"Numerical Method for Simultaneous Design and Control Optimization of\n Seasonal Thermal Energy Storage Systems","summary":" The transition to a carbon-neutral energy system requires massive\ninstallation of renewable energy sources and economically feasible energy\nstorage solutions. This study addresses these challenges by optimizing the\ndesign and control strategies of an energy system that meets the heat and\nelectricity demands of a community. The proposed system integrates solar and\nwind power with energy storage, including seasonal thermal energy storage\n(STES) and battery, coupled via a heat pump. This approach enhances\nself-sufficiency and effectively mitigates seasonal mismatches. To model heat\ntransfer between the storage and the ground in the STES system, we employ a\nmulti-node lumped-parameter method. The optimization problem is formulated as a\nperiodic optimal control problem, which is then transcribed into a nonlinear\nprogramming problem. To reduce computational complexity, we apply the averaging\nmethod, which significantly lowers the effort required to solve the problem. We\napply this approach to a case study, where the economically optimized\nconfiguration results in a projected total energy cost per household of\napproximately 75 EUR/month over 30 years for both heat and electricity. 
This\nstudy demonstrates the feasibility of designing economically viable, autonomous\nenergy communities in real-world scenarios, and provides a comprehensive\noptimization framework for designing system components and control strategies.\n","authors":["Wonsun Song","Jakob Harzer","Christopher Jung","Leon Sander","Moritz Diehl"],"pdf_url":"https://arxiv.org/pdf/2501.07427v1.pdf","comment":"35 pages, 12 figures, submitted to Renewable Energy. Editor-in-chief:\n Nidia Caetano"},{"id":"http://arxiv.org/abs/2501.07413v1","updated":"2025-01-13T15:31:50Z","published":"2025-01-13T15:31:50Z","title":"Stable Set Polytopes with Rank $|V(G)|/3$ for the Lov{á}sz--Schrijver\n SDP Operator","summary":" We study the lift-and-project rank of the stable set polytope of graphs with\nrespect to the Lov{\\'a}sz--Schrijver SDP operator $\\text{LS}_+$ applied to the\nfractional stable set polytope. In particular, we show that for every positive\ninteger $\\ell$, the smallest possible graph with $\\text{LS}_+$-rank $\\ell$\ncontains $3\\ell$ vertices. This result is sharp and settles a conjecture posed\nby Lipt{\\'a}k and the second author in 2003, as well as answers a\ngeneralization of a problem posed by Knuth in 1994. We also show that for every\npositive integer $\\ell$ there exists a vertex-transitive graph on $4\\ell+12$\nvertices with $\\text{LS}_+$-rank at least $\\ell$.\n","authors":["Yu Hin Au","Levent Tunçel"],"pdf_url":"https://arxiv.org/pdf/2501.07413v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.04081v2","updated":"2025-01-13T23:48:32Z","published":"2024-03-06T22:24:05Z","title":"Directional Smoothness and Gradient Methods: Convergence and Adaptivity","summary":" We develop new sub-optimality bounds for gradient descent (GD) that depend on\nthe conditioning of the objective along the path of optimization rather than on\nglobal, worst-case constants. 
Key to our proofs is directional smoothness, a\nmeasure of gradient variation that we use to develop upper-bounds on the\nobjective. Minimizing these upper-bounds requires solving implicit equations to\nobtain a sequence of strongly adapted step-sizes; we show that these equations\nare straightforward to solve for convex quadratics and lead to new guarantees\nfor two classical step-sizes. For general functions, we prove that the Polyak\nstep-size and normalized GD obtain fast, path-dependent rates despite using no\nknowledge of the directional smoothness. Experiments on logistic regression\nshow our convergence guarantees are tighter than the classical theory based on\n$L$-smoothness.\n","authors":["Aaron Mishkin","Ahmed Khaled","Yuanhao Wang","Aaron Defazio","Robert M. Gower"],"pdf_url":"https://arxiv.org/pdf/2403.04081v2.pdf","comment":"Published as a poster at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2412.00617v2","updated":"2025-01-13T23:12:46Z","published":"2024-11-30T23:52:21Z","title":"Flow matching for stochastic linear control systems","summary":" This paper addresses the problem of steering an initial probability\ndistribution to a target probability distribution through a deterministic or\nstochastic linear control system. Our proposed approach is inspired by the flow\nmatching methodology, with the difference that we can only affect the flow\nthrough the given control channels. The motivation comes from applications such\nas robotic swarms and stochastic thermodynamics, where agents or particles can\nonly be manipulated through control actions. The feedback control law that\nachieves the task is characterized as the conditional expectation of the\ncontrol inputs for the stochastic bridges that respect the given control system\ndynamics. 
Explicit forms are derived for special cases, and a numerical\nprocedure is presented to approximate the control law, illustrated with\nexamples.\n","authors":["Yuhang Mei","Mohammad Al-Jarrah","Amirhossein Taghvaei","Yongxin Chen"],"pdf_url":"https://arxiv.org/pdf/2412.00617v2.pdf","comment":"13 pages, 3 figures"},{"id":"http://arxiv.org/abs/2312.04045v5","updated":"2025-01-13T22:44:23Z","published":"2023-12-07T05:07:12Z","title":"Partial Information in a Mean-Variance Portfolio Selection Game","summary":" This paper considers finitely many investors who perform mean-variance\nportfolio selection under relative performance criteria. That is, each investor\nis concerned about not only her terminal wealth, but how it compares to the\naverage terminal wealth of all investors. At the inter-personal level, each\ninvestor selects a trading strategy in response to others' strategies. This\nselected strategy additionally needs to yield an equilibrium intra-personally,\nso as to resolve time inconsistency among the investor's current and future\nselves (triggered by the mean-variance objective). A Nash equilibrium we look\nfor is thus a tuple of trading strategies under which every investor achieves\nher intra-personal equilibrium simultaneously. We derive such a Nash\nequilibrium explicitly in the idealized case of full information (i.e., the\ndynamics of the underlying stock is perfectly known) and semi-explicitly in the\nrealistic case of partial information (i.e., the stock evolution is observed,\nbut the expected return of the stock is not precisely known). The formula under\npartial information consists of the myopic trading and intertemporal hedging\nterms, both of which depend on an additional state process that serves to\nfilter the true expected return and whose influence on trading is captured by a\ndegenerate Cauchy problem. 
Our results identify that relative performance\ncriteria can induce downward self-reinforcement of investors' wealth--if every\ninvestor suffers a wealth decline simultaneously, then everyone's wealth tends\nto decline further. This phenomenon, as numerical examples show, is negligible\nunder full information but pronounced under partial information.\n","authors":["Yu-Jui Huang","Li-Hsien Sun"],"pdf_url":"https://arxiv.org/pdf/2312.04045v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.19234v3","updated":"2025-01-13T21:17:16Z","published":"2024-10-25T00:53:16Z","title":"On the Trade-Off Between Distributional Belief and Ambiguity:\n Conservatism, Finite-Sample Guarantees, and Asymptotic Properties","summary":" We propose and analyze a new data-driven trade-off (TRO) approach for\nmodeling uncertainty that serves as a middle ground between the optimistic\napproach, which adopts a distributional belief, and the pessimistic\ndistributionally robust optimization approach, which hedges against\ndistributional ambiguity. We equip the TRO model with a TRO ambiguity set\ncharacterized by a size parameter controlling the level of optimism and a shape\nparameter representing distributional ambiguity. We first show that\nconstructing the TRO ambiguity set using a general star-shaped shape parameter\nwith the empirical distribution as its star center is necessary and sufficient\nto guarantee the hierarchical structure of the sequence of TRO ambiguity sets.\nThen, we analyze the properties of the TRO model, including quantifying\nconservatism, quantifying bias and generalization error, and establishing\nasymptotic properties. Specifically, we show that the TRO model could generate\na spectrum of decisions, ranging from optimistic to conservative decisions.\nAdditionally, we show that it could produce an unbiased estimator of the true\noptimal value. 
Furthermore, we establish the almost-sure convergence of the\noptimal value and the set of optimal solutions of the TRO model to their true\ncounterparts. We exemplify our theoretical results using an inventory control\nproblem and a portfolio optimization problem.\n","authors":["Man Yiu Tsang","Karmel S. Shehadeh"],"pdf_url":"https://arxiv.org/pdf/2410.19234v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07681v1","updated":"2025-01-13T20:41:52Z","published":"2025-01-13T20:41:52Z","title":"Dataset Distillation as Pushforward Optimal Quantization","summary":" Dataset distillation aims to find a synthetic training set such that training\non the synthetic data achieves similar performance to training on real data,\nwith orders of magnitude less computational requirements. Existing methods can\nbe broadly categorized as either bi-level optimization problems that have\nneural network training heuristics as the lower level problem, or disentangled\nmethods that bypass the bi-level optimization by matching distributions of\ndata. The latter method has the major advantages of speed and scalability in\nterms of size of both training and distilled datasets. We demonstrate that when\nequipped with an encoder-decoder structure, the empirically successful\ndisentangled methods can be reformulated as an optimal quantization problem,\nwhere a finite set of points is found to approximate the underlying probability\nmeasure by minimizing the expected projection distance. In particular, we link\nexisting disentangled dataset distillation methods to the classical optimal\nquantization and Wasserstein barycenter problems, demonstrating consistency of\ndistilled datasets for diffusion-based generative priors. 
We propose a simple\nextension of the state-of-the-art data distillation method D4M, achieving\nbetter performance on the ImageNet-1K dataset with trivial additional\ncomputation, and state-of-the-art performance in higher image-per-class\nsettings.\n","authors":["Hong Ye Tan","Emma Slade"],"pdf_url":"https://arxiv.org/pdf/2501.07681v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07680v1","updated":"2025-01-13T20:37:24Z","published":"2025-01-13T20:37:24Z","title":"Input-to-state stability in integral norms for linear\n infinite-dimensional systems","summary":" We study integral-to-integral input-to-state stability for\ninfinite-dimensional linear systems with inputs and trajectories in\n$L^p$-spaces. We start by developing the corresponding admissibility theory for\nlinear systems with unbounded input operators. While input-to-state stability\nis typically characterized by exponential stability and finite-time\nadmissibility, we show that this equivalence does not extend directly to\nintegral norms. For analytic semigroups, we establish a precise\ncharacterization using maximal regularity theory. Additionally, we provide\ndirect Lyapunov theorems and construct Lyapunov functions for $L^p$-$L^q$-ISS\nand demonstrate the results with examples, including diagonal systems and\ndiffusion equations.\n","authors":["Sahiba Arora","Andrii Mironchenko"],"pdf_url":"https://arxiv.org/pdf/2501.07680v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07671v1","updated":"2025-01-13T20:09:02Z","published":"2025-01-13T20:09:02Z","title":"Towards nonlinearity. The p-regularity theory. Applications and\n developments","summary":" We present recent advances in the analysis of nonlinear equations with\nsingular operators and nonlinear optimization problems with constraints given\nby singular mappings. The results are obtained within the framework of\n$p$-regularity theory, which has developed successfully over the last forty\nyears. 
We illustrate the theory with its applications to degenerate problems in\nvarious areas of mathematics. In particular, we address the problem of\ndescribing the tangent cone to the solution set of nonlinear equations in a\nsingular case. The structure of p-factor operators is used to propose\noptimality conditions and construct numerical methods for solving degenerate\nnonlinear equations and optimization problems. The methods presented in the\npaper can be considered as the first numerical approaches targeting solutions\nof degenerate problems, such as the Van der Pol differential equation,\nboundary-value problems with a small parameter, partial differential equations\nwhere Poincar\\'e's method of small parameter fails, nonlinear degenerate\ndynamical systems, and others. There are various practical applications for the\ntheory of p-regularity, including structural engineering, composite materials,\nand material design. For instance, the theory can be applied to analyze the\nbehavior of materials with irregular or complex properties. By considering\nhigher-order derivatives, it becomes possible to model and predict the response\nof materials to external forces, such as stress or temperature variations. In\ngeophysics, the $p$-regularity theory can be utilized to analyze and interpret\ncomplex data obtained from seismic surveys, gravity measurements, or\nelectromagnetic surveys. The theory also finds applications in the analysis of\nnonlinear differential equations arising in control systems, geometric and\ntopological analysis, biomechanics, and many other fields.\n","authors":["E. Bednarczuk","O. Brezhneva","K. Leśniewski","A. Prusińska","A. 
Tret'yakov"],"pdf_url":"https://arxiv.org/pdf/2501.07671v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07652v1","updated":"2025-01-13T19:24:14Z","published":"2025-01-13T19:24:14Z","title":"Finite Sample Identification of Partially Observed Bilinear Dynamical\n Systems","summary":" We consider the problem of learning a realization of a partially observed\nbilinear dynamical system (BLDS) from noisy input-output data. Given a single\ntrajectory of input-output samples, we provide a finite time analysis for\nlearning the system's Markov-like parameters, from which a balanced realization\nof the bilinear system can be obtained. Our bilinear system identification\nalgorithm learns the system's Markov-like parameters by regressing the outputs\nto highly correlated, nonlinear, and heavy-tailed covariates. Moreover, the\nstability of BLDS depends on the sequence of inputs used to excite the system.\nThese properties, unique to partially observed bilinear dynamical systems, pose\nsignificant challenges to the analysis of our algorithm for learning the\nunknown dynamics. We address these challenges and provide high probability\nerror bounds on our identification algorithm under a uniform stability\nassumption. Our analysis provides insights into system theoretic quantities\nthat affect learning accuracy and sample complexity. Lastly, we perform\nnumerical experiments with synthetic data to reinforce these insights.\n","authors":["Yahya Sattar","Yassir Jedra","Maryam Fazel","Sarah Dean"],"pdf_url":"https://arxiv.org/pdf/2501.07652v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17753v2","updated":"2025-01-13T19:10:31Z","published":"2024-05-28T02:11:21Z","title":"Regression Equilibrium in Electricity Markets","summary":" In two-stage electricity markets, renewable power producers enter the\nday-ahead market with a forecast of future power generation and then reconcile\nany forecast deviation in the real-time market at a penalty. 
The choice of the\nforecast model is thus an important strategy decision for renewable power\nproducers as it affects financial performance. In electricity markets with\nlarge shares of renewable generation, the choice of the forecast model impacts\nnot only individual performance but also outcomes for other producers. In this\npaper, we argue for the existence of a competitive regression equilibrium in\ntwo-stage electricity markets in terms of the parameters of private forecast\nmodels informing the participation strategies of renewable power producers. In\nour model, renewables optimize the forecast against the day-ahead and real-time\nprices, thereby maximizing the average profits across the day-ahead and\nreal-time markets. By doing so, they also implicitly enhance the temporal cost\ncoordination of day-ahead and real-time markets. We base the equilibrium\nanalysis on the theory of variational inequalities, providing results on the\nexistence and uniqueness of regression equilibrium in energy-only markets. We\nalso devise two methods to compute regression equilibrium: centralized\noptimization and a decentralized ADMM-based algorithm.\n","authors":["Vladimir Dvorkin"],"pdf_url":"https://arxiv.org/pdf/2405.17753v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02648v4","updated":"2025-01-13T19:05:07Z","published":"2024-03-05T04:35:59Z","title":"Remove that Square Root: A New Efficient Scale-Invariant Version of\n AdaGrad","summary":" Adaptive methods are extremely popular in machine learning as they make\nlearning rate tuning less expensive. This paper introduces a novel optimization\nalgorithm named KATE, which presents a scale-invariant adaptation of the\nwell-known AdaGrad algorithm. We prove the scale-invariance of KATE for the\ncase of Generalized Linear Models. 
Moreover, for general smooth non-convex\nproblems, we establish a convergence rate of $O \\left(\\frac{\\log T}{\\sqrt{T}}\n\\right)$ for KATE, matching the best-known ones for AdaGrad and Adam. We also\ncompare KATE to other state-of-the-art adaptive algorithms Adam and AdaGrad in\nnumerical experiments with different problems, including complex machine\nlearning tasks like image classification and text classification on real data.\nThe results indicate that KATE consistently outperforms AdaGrad and\nmatches/surpasses the performance of Adam in all considered scenarios.\n","authors":["Sayantan Choudhury","Nazarii Tupitsa","Nicolas Loizou","Samuel Horvath","Martin Takac","Eduard Gorbunov"],"pdf_url":"https://arxiv.org/pdf/2403.02648v4.pdf","comment":"32 pages, 12 figures"},{"id":"http://arxiv.org/abs/1912.00043v3","updated":"2025-01-13T18:34:11Z","published":"2019-11-29T19:22:36Z","title":"Barcodes as Summary of Loss Function Topology","summary":" We propose to study neural networks' loss surfaces by methods of topological\ndata analysis. We suggest to apply barcodes of Morse complexes to explore\ntopology of loss surfaces. An algorithm for calculations of the loss function's\nbarcodes of local minima is described. We have conducted experiments for\ncalculating barcodes of local minima for benchmark functions and for loss\nsurfaces of small neural networks. Our experiments confirm our two principal\nobservations for neural networks' loss surfaces. First, the barcodes of local\nminima are located in a small lower part of the range of values of neural\nnetworks' loss function. Secondly, increase of the neural network's depth and\nwidth lowers the barcodes of local minima. 
This has some natural implications\nfor the neural network's learning and for its generalization properties.\n","authors":["Serguei Barannikov","Alexander Korotin","Dmitry Oganesyan","Daniil Emtsev","Evgeny Burnaev"],"pdf_url":"https://arxiv.org/pdf/1912.00043v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07401v1","updated":"2025-01-13T15:17:48Z","published":"2025-01-13T15:17:48Z","title":"Smoothing Iterative Consensus-based Optimization Algorithm for Nonsmooth\n Nonconvex Optimization Problems with Global Optimality","summary":" In this paper, we focus on finding the global minimizer of a general\nunconstrained nonsmooth nonconvex optimization problem. Taking advantage of the\nsmoothing method and the consensus-based optimization (CBO) method, we propose\na novel smoothing iterative consensus-based optimization (SICBO) algorithm.\nFirst, we prove that the solution process of the proposed algorithm here\nexponentially converges to a common stochastic consensus point almost surely.\nSecond, we establish a detailed theoretical analysis to ensure the small enough\nerror between the objective function value at the consensus point and the\noptimal function value, to the best of our knowledge, which provides the first\ntheoretical guarantee to the global optimality of the proposed algorithm for\nnonconvex optimization problems. Moreover, unlike the previously introduced CBO\nmethods, the theoretical results are valid for the cases that the objective\nfunction is nonsmooth, nonconvex and perhaps non-Lipschitz continuous. 
Finally,\nseveral numerical examples are performed to illustrate the effectiveness of our\nproposed algorithm for solving the global minimizer of the nonsmooth and\nnonconvex optimization problems.\n","authors":["Jiazhen Wei","Wei Bian"],"pdf_url":"https://arxiv.org/pdf/2501.07401v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07400v1","updated":"2025-01-13T15:17:28Z","published":"2025-01-13T15:17:28Z","title":"Derivation of effective gradient flow equations and dynamical truncation\n of training data in Deep Learning","summary":" We derive explicit equations governing the cumulative biases and weights in\nDeep Learning with ReLU activation function, based on gradient descent for the\nEuclidean cost in the input layer, and under the assumption that the weights\nare, in a precise sense, adapted to the coordinate system distinguished by the\nactivations. We show that gradient descent corresponds to a dynamical process\nin the input layer, whereby clusters of data are progressively reduced in\ncomplexity (\"truncated\") at an exponential rate that increases with the number\nof data points that have already been truncated. We provide a detailed\ndiscussion of several types of solutions to the gradient flow equations. A main\nmotivation for this work is to shed light on the interpretability question in\nsupervised learning.\n","authors":["Thomas Chen"],"pdf_url":"https://arxiv.org/pdf/2501.07400v1.pdf","comment":"AMS Latex, 35 pages"},{"id":"http://arxiv.org/abs/2501.07383v1","updated":"2025-01-13T15:02:27Z","published":"2025-01-13T15:02:27Z","title":"Anomalies of the Scholtes regularization for mathematical programs with\n complementarity constraints","summary":" For mathematical programs with complementarity constraints (MPCC), we refine\nthe convergence analysis of the Scholtes regularization. Our goal is to relate\nnondegenerate C-stationary points of MPCC with nondegenerate Karush-Kuhn-Tucker\npoints of its Scholtes regularization. 
We detected the following anomalies: (i)\nin a neighborhood of a nondegenerate C-stationary point there could be\ndegenerate Karush-Kuhn-Tucker points of the Scholtes regularization; (ii) even\nif nondegenerate, they might be locally non-unique; (iii) if nevertheless\nunique, their quadratic index potentially differs from the C-index of the\nC-stationary point under consideration. Thus, a change of the topological type\nfor Karush-Kuhn-Tucker points of the Scholtes regularization is possible. In\nparticular, a nondegenerate minimizer of MPCC might be approximated by saddle\npoints. In order to bypass the mentioned anomalies, an additional generic\ncondition for nondegenerate C-stationary points of MPCC is identified. Then, we\nuniquely trace nondegenerate Karush-Kuhn-Tucker points of the Scholtes\nregularization and successively maintain their topological type.\n","authors":["Vladimir Shikhman","Sebastian Lämmel"],"pdf_url":"https://arxiv.org/pdf/2501.07383v1.pdf","comment":"25 pages"},{"id":"http://arxiv.org/abs/2305.08094v2","updated":"2025-01-13T14:53:11Z","published":"2023-05-14T08:10:49Z","title":"Accelerating genetic optimization of nonlinear model predictive control\n by learning optimal search space size","summary":" Genetic algorithm (GA) is typically used to solve nonlinear model predictive\ncontrol's optimization problem. However, the size of the search space in which\nthe GA searches for the optimal control inputs is crucial for its applicability\nto fast-response systems. This paper proposes accelerating the genetic\noptimization of NMPC by learning optimal search space size. The approach trains\na multivariate regression model to adaptively predict the best smallest size of\nthe search space in every control cycle. The proposed approach reduces the GA's\ncomputational time, improves the chance of convergence to better control\ninputs, and provides a stable and feasible solution. 
The proposed approach was\nevaluated on three nonlinear systems and compared to four other evolutionary\nalgorithms implemented in a processor-in-the-loop fashion. The results show\nthat the proposed approach provides a 17-45\\% reduction in computational time\nand increases the convergence rate by 35-47\\%. The source code is available on\nGitHub.\n","authors":["Eslam Mostafa","Hussein A. Aly","Ahmed Elliethy"],"pdf_url":"https://arxiv.org/pdf/2305.08094v2.pdf","comment":"Accepted by the Journal of Control and Decision"},{"id":"http://arxiv.org/abs/2501.07307v1","updated":"2025-01-13T13:16:30Z","published":"2025-01-13T13:16:30Z","title":"Quasiconvex Bulk and Surface Energies with subquadratic growth","summary":" We establish partial H\\\"older continuity of the gradient for equilibrium\nconfigurations of vectorial multidimensional variational problems, involving\nbulk and surface energies. The bulk energy densities are uniformly strictly\nquasiconvex functions with $p$-growth, $1